aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mark Salyzyn <mark_salyzyn@xyratex.com> 2012-01-17 11:52:24 -0500
committer James Bottomley <JBottomley@Parallels.com> 2012-02-19 09:08:52 -0500
commit 5954d7380f627371c4d8d7c59c08f9596aa2c674 (patch)
tree 0908dd4a832ba0ef07c79237f9f6ab3f2ad20a36
parent d95d00016f8f51dc502cadb263d861bd8c0212bb (diff)
[SCSI] pm8001: deficient responses to IO_XFER_ERROR_BREAK and IO_XFER_OPEN_RETRY_TIMEOUT
IO_XFER_ERROR_BREAK and IO_XFER_OPEN_RETRY_TIMEOUT are missing the required actions outlined in the programming manual for the pm8001. Because the code requirements of these recovery responses overlap, we found it necessary to bundle them together into one patch.

When a break is received during the command phase (ssp_completion), it is the result of a timeout or interruption on the bus. Logic suggests that we should retry the command.

When a break is received during the data phase (ssp_event), the task must be aborted on the target, or the target will retain a data-phase lock. The target then becomes unresponsive to all future media commands, yet still responds successfully to TUR, INQUIRY and ABORT, leading eventually to target failure through several abort-cycle loops.

The open retry interval is exceedingly short, resulting in occasional target drop-off during expander resets or when targets push back during bad-block remapping. Increased the effective timeout from 130ms to 1.5 seconds for each try, so that it triggers after the administrative inquiry/tur timeout in the scsi subsystem and keeps error-recovery harmonics to a minimum.

When an open retry timeout event is received, the action required by the targets is to issue an abort for the outstanding command; logic then suggests we retry the command, as this state is usually an indication of a credit block or busy condition on the target.

We hijacked the pm8001_handle_event work queue handler so that it takes a task as an argument instead of a device for the workers that support the deferred handling outlined above.

Moderate to heavy bad-path testing on a 2.6.32 vintage kernel, compile-testing on scsi-misc-2.6 kernel ...

Signed-off-by: Mark Salyzyn <mark_salyzyn@xyratex.com>
Acked-by: Jack Wang <jack_wang@usish.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
-rw-r--r-- drivers/scsi/pm8001/pm8001_hwi.c 196
-rw-r--r-- drivers/scsi/pm8001/pm8001_sas.c 72
-rw-r--r-- drivers/scsi/pm8001/pm8001_sas.h 5
3 files changed, 261 insertions, 12 deletions
diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index f3c44b96c1c9..3920b49f4f57 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -622,7 +622,8 @@ static int __devinit pm8001_chip_init(struct pm8001_hba_info *pm8001_ha)
622 update_inbnd_queue_table(pm8001_ha, 0); 622 update_inbnd_queue_table(pm8001_ha, 0);
623 update_outbnd_queue_table(pm8001_ha, 0); 623 update_outbnd_queue_table(pm8001_ha, 0);
624 mpi_set_phys_g3_with_ssc(pm8001_ha, 0); 624 mpi_set_phys_g3_with_ssc(pm8001_ha, 0);
625 mpi_set_open_retry_interval_reg(pm8001_ha, 7); 625 /* 7->130ms, 34->500ms, 119->1.5s */
626 mpi_set_open_retry_interval_reg(pm8001_ha, 119);
626 /* notify firmware update finished and check initialization status */ 627 /* notify firmware update finished and check initialization status */
627 if (0 == mpi_init_check(pm8001_ha)) { 628 if (0 == mpi_init_check(pm8001_ha)) {
628 PM8001_INIT_DBG(pm8001_ha, 629 PM8001_INIT_DBG(pm8001_ha,
@@ -1421,24 +1422,191 @@ static void pm8001_work_fn(struct work_struct *work)
1421 struct pm8001_device *pm8001_dev; 1422 struct pm8001_device *pm8001_dev;
1422 struct domain_device *dev; 1423 struct domain_device *dev;
1423 1424
1425 /*
1426 * So far, all users of this stash an associated structure here.
1427 * If we get here, and this pointer is null, then the action
1428 * was cancelled. This nullification happens when the device
1429 * goes away.
1430 */
1431 pm8001_dev = pw->data; /* Most stash device structure */
1432 if ((pm8001_dev == NULL)
1433 || ((pw->handler != IO_XFER_ERROR_BREAK)
1434 && (pm8001_dev->dev_type == NO_DEVICE))) {
1435 kfree(pw);
1436 return;
1437 }
1438
1424 switch (pw->handler) { 1439 switch (pw->handler) {
1440 case IO_XFER_ERROR_BREAK:
1441 { /* This one stashes the sas_task instead */
1442 struct sas_task *t = (struct sas_task *)pm8001_dev;
1443 u32 tag;
1444 struct pm8001_ccb_info *ccb;
1445 struct pm8001_hba_info *pm8001_ha = pw->pm8001_ha;
1446 unsigned long flags, flags1;
1447 struct task_status_struct *ts;
1448 int i;
1449
1450 if (pm8001_query_task(t) == TMF_RESP_FUNC_SUCC)
1451 break; /* Task still on lu */
1452 spin_lock_irqsave(&pm8001_ha->lock, flags);
1453
1454 spin_lock_irqsave(&t->task_state_lock, flags1);
1455 if (unlikely((t->task_state_flags & SAS_TASK_STATE_DONE))) {
1456 spin_unlock_irqrestore(&t->task_state_lock, flags1);
1457 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1458 break; /* Task got completed by another */
1459 }
1460 spin_unlock_irqrestore(&t->task_state_lock, flags1);
1461
1462 /* Search for a possible ccb that matches the task */
1463 for (i = 0; ccb = NULL, i < PM8001_MAX_CCB; i++) {
1464 ccb = &pm8001_ha->ccb_info[i];
1465 tag = ccb->ccb_tag;
1466 if ((tag != 0xFFFFFFFF) && (ccb->task == t))
1467 break;
1468 }
1469 if (!ccb) {
1470 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1471 break; /* Task got freed by another */
1472 }
1473 ts = &t->task_status;
1474 ts->resp = SAS_TASK_COMPLETE;
1475 /* Force the midlayer to retry */
1476 ts->stat = SAS_QUEUE_FULL;
1477 pm8001_dev = ccb->device;
1478 if (pm8001_dev)
1479 pm8001_dev->running_req--;
1480 spin_lock_irqsave(&t->task_state_lock, flags1);
1481 t->task_state_flags &= ~SAS_TASK_STATE_PENDING;
1482 t->task_state_flags &= ~SAS_TASK_AT_INITIATOR;
1483 t->task_state_flags |= SAS_TASK_STATE_DONE;
1484 if (unlikely((t->task_state_flags & SAS_TASK_STATE_ABORTED))) {
1485 spin_unlock_irqrestore(&t->task_state_lock, flags1);
1486 PM8001_FAIL_DBG(pm8001_ha, pm8001_printk("task 0x%p"
1487 " done with event 0x%x resp 0x%x stat 0x%x but"
1488 " aborted by upper layer!\n",
1489 t, pw->handler, ts->resp, ts->stat));
1490 pm8001_ccb_task_free(pm8001_ha, t, ccb, tag);
1491 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1492 } else {
1493 spin_unlock_irqrestore(&t->task_state_lock, flags1);
1494 pm8001_ccb_task_free(pm8001_ha, t, ccb, tag);
1495 mb();/* in order to force CPU ordering */
1496 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1497 t->task_done(t);
1498 }
1499 } break;
1500 case IO_XFER_OPEN_RETRY_TIMEOUT:
1501 { /* This one stashes the sas_task instead */
1502 struct sas_task *t = (struct sas_task *)pm8001_dev;
1503 u32 tag;
1504 struct pm8001_ccb_info *ccb;
1505 struct pm8001_hba_info *pm8001_ha = pw->pm8001_ha;
1506 unsigned long flags, flags1;
1507 int i, ret = 0;
1508
1509 PM8001_IO_DBG(pm8001_ha,
1510 pm8001_printk("IO_XFER_OPEN_RETRY_TIMEOUT\n"));
1511
1512 ret = pm8001_query_task(t);
1513
1514 PM8001_IO_DBG(pm8001_ha,
1515 switch (ret) {
1516 case TMF_RESP_FUNC_SUCC:
1517 pm8001_printk("...Task on lu\n");
1518 break;
1519
1520 case TMF_RESP_FUNC_COMPLETE:
1521 pm8001_printk("...Task NOT on lu\n");
1522 break;
1523
1524 default:
1525 pm8001_printk("...query task failed!!!\n");
1526 break;
1527 });
1528
1529 spin_lock_irqsave(&pm8001_ha->lock, flags);
1530
1531 spin_lock_irqsave(&t->task_state_lock, flags1);
1532
1533 if (unlikely((t->task_state_flags & SAS_TASK_STATE_DONE))) {
1534 spin_unlock_irqrestore(&t->task_state_lock, flags1);
1535 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1536 if (ret == TMF_RESP_FUNC_SUCC) /* task on lu */
1537 (void)pm8001_abort_task(t);
1538 break; /* Task got completed by another */
1539 }
1540
1541 spin_unlock_irqrestore(&t->task_state_lock, flags1);
1542
1543 /* Search for a possible ccb that matches the task */
1544 for (i = 0; ccb = NULL, i < PM8001_MAX_CCB; i++) {
1545 ccb = &pm8001_ha->ccb_info[i];
1546 tag = ccb->ccb_tag;
1547 if ((tag != 0xFFFFFFFF) && (ccb->task == t))
1548 break;
1549 }
1550 if (!ccb) {
1551 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1552 if (ret == TMF_RESP_FUNC_SUCC) /* task on lu */
1553 (void)pm8001_abort_task(t);
1554 break; /* Task got freed by another */
1555 }
1556
1557 pm8001_dev = ccb->device;
1558 dev = pm8001_dev->sas_device;
1559
1560 switch (ret) {
1561 case TMF_RESP_FUNC_SUCC: /* task on lu */
1562 ccb->open_retry = 1; /* Snub completion */
1563 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1564 ret = pm8001_abort_task(t);
1565 ccb->open_retry = 0;
1566 switch (ret) {
1567 case TMF_RESP_FUNC_SUCC:
1568 case TMF_RESP_FUNC_COMPLETE:
1569 break;
1570 default: /* device misbehavior */
1571 ret = TMF_RESP_FUNC_FAILED;
1572 PM8001_IO_DBG(pm8001_ha,
1573 pm8001_printk("...Reset phy\n"));
1574 pm8001_I_T_nexus_reset(dev);
1575 break;
1576 }
1577 break;
1578
1579 case TMF_RESP_FUNC_COMPLETE: /* task not on lu */
1580 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1581 /* Do we need to abort the task locally? */
1582 break;
1583
1584 default: /* device misbehavior */
1585 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
1586 ret = TMF_RESP_FUNC_FAILED;
1587 PM8001_IO_DBG(pm8001_ha,
1588 pm8001_printk("...Reset phy\n"));
1589 pm8001_I_T_nexus_reset(dev);
1590 }
1591
1592 if (ret == TMF_RESP_FUNC_FAILED)
1593 t = NULL;
1594 pm8001_open_reject_retry(pm8001_ha, t, pm8001_dev);
1595 PM8001_IO_DBG(pm8001_ha, pm8001_printk("...Complete\n"));
1596 } break;
1425 case IO_OPEN_CNX_ERROR_IT_NEXUS_LOSS: 1597 case IO_OPEN_CNX_ERROR_IT_NEXUS_LOSS:
1426 pm8001_dev = pw->data;
1427 dev = pm8001_dev->sas_device; 1598 dev = pm8001_dev->sas_device;
1428 pm8001_I_T_nexus_reset(dev); 1599 pm8001_I_T_nexus_reset(dev);
1429 break; 1600 break;
1430 case IO_OPEN_CNX_ERROR_STP_RESOURCES_BUSY: 1601 case IO_OPEN_CNX_ERROR_STP_RESOURCES_BUSY:
1431 pm8001_dev = pw->data;
1432 dev = pm8001_dev->sas_device; 1602 dev = pm8001_dev->sas_device;
1433 pm8001_I_T_nexus_reset(dev); 1603 pm8001_I_T_nexus_reset(dev);
1434 break; 1604 break;
1435 case IO_DS_IN_ERROR: 1605 case IO_DS_IN_ERROR:
1436 pm8001_dev = pw->data;
1437 dev = pm8001_dev->sas_device; 1606 dev = pm8001_dev->sas_device;
1438 pm8001_I_T_nexus_reset(dev); 1607 pm8001_I_T_nexus_reset(dev);
1439 break; 1608 break;
1440 case IO_DS_NON_OPERATIONAL: 1609 case IO_DS_NON_OPERATIONAL:
1441 pm8001_dev = pw->data;
1442 dev = pm8001_dev->sas_device; 1610 dev = pm8001_dev->sas_device;
1443 pm8001_I_T_nexus_reset(dev); 1611 pm8001_I_T_nexus_reset(dev);
1444 break; 1612 break;
@@ -1493,6 +1661,11 @@ mpi_ssp_completion(struct pm8001_hba_info *pm8001_ha , void *piomb)
1493 status = le32_to_cpu(psspPayload->status); 1661 status = le32_to_cpu(psspPayload->status);
1494 tag = le32_to_cpu(psspPayload->tag); 1662 tag = le32_to_cpu(psspPayload->tag);
1495 ccb = &pm8001_ha->ccb_info[tag]; 1663 ccb = &pm8001_ha->ccb_info[tag];
1664 if ((status == IO_ABORTED) && ccb->open_retry) {
1665 /* Being completed by another */
1666 ccb->open_retry = 0;
1667 return;
1668 }
1496 pm8001_dev = ccb->device; 1669 pm8001_dev = ccb->device;
1497 param = le32_to_cpu(psspPayload->param); 1670 param = le32_to_cpu(psspPayload->param);
1498 1671
@@ -1548,6 +1721,8 @@ mpi_ssp_completion(struct pm8001_hba_info *pm8001_ha , void *piomb)
1548 pm8001_printk("IO_XFER_ERROR_BREAK\n")); 1721 pm8001_printk("IO_XFER_ERROR_BREAK\n"));
1549 ts->resp = SAS_TASK_COMPLETE; 1722 ts->resp = SAS_TASK_COMPLETE;
1550 ts->stat = SAS_OPEN_REJECT; 1723 ts->stat = SAS_OPEN_REJECT;
1724 /* Force the midlayer to retry */
1725 ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
1551 break; 1726 break;
1552 case IO_XFER_ERROR_PHY_NOT_READY: 1727 case IO_XFER_ERROR_PHY_NOT_READY:
1553 PM8001_IO_DBG(pm8001_ha, 1728 PM8001_IO_DBG(pm8001_ha,
@@ -1752,9 +1927,8 @@ static void mpi_ssp_event(struct pm8001_hba_info *pm8001_ha , void *piomb)
1752 case IO_XFER_ERROR_BREAK: 1927 case IO_XFER_ERROR_BREAK:
1753 PM8001_IO_DBG(pm8001_ha, 1928 PM8001_IO_DBG(pm8001_ha,
1754 pm8001_printk("IO_XFER_ERROR_BREAK\n")); 1929 pm8001_printk("IO_XFER_ERROR_BREAK\n"));
1755 ts->resp = SAS_TASK_COMPLETE; 1930 pm8001_handle_event(pm8001_ha, t, IO_XFER_ERROR_BREAK);
1756 ts->stat = SAS_INTERRUPTED; 1931 return;
1757 break;
1758 case IO_XFER_ERROR_PHY_NOT_READY: 1932 case IO_XFER_ERROR_PHY_NOT_READY:
1759 PM8001_IO_DBG(pm8001_ha, 1933 PM8001_IO_DBG(pm8001_ha,
1760 pm8001_printk("IO_XFER_ERROR_PHY_NOT_READY\n")); 1934 pm8001_printk("IO_XFER_ERROR_PHY_NOT_READY\n"));
@@ -1833,10 +2007,8 @@ static void mpi_ssp_event(struct pm8001_hba_info *pm8001_ha , void *piomb)
1833 case IO_XFER_OPEN_RETRY_TIMEOUT: 2007 case IO_XFER_OPEN_RETRY_TIMEOUT:
1834 PM8001_IO_DBG(pm8001_ha, 2008 PM8001_IO_DBG(pm8001_ha,
1835 pm8001_printk("IO_XFER_OPEN_RETRY_TIMEOUT\n")); 2009 pm8001_printk("IO_XFER_OPEN_RETRY_TIMEOUT\n"));
1836 ts->resp = SAS_TASK_COMPLETE; 2010 pm8001_handle_event(pm8001_ha, t, IO_XFER_OPEN_RETRY_TIMEOUT);
1837 ts->stat = SAS_OPEN_REJECT; 2011 return;
1838 ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
1839 break;
1840 case IO_XFER_ERROR_UNEXPECTED_PHASE: 2012 case IO_XFER_ERROR_UNEXPECTED_PHASE:
1841 PM8001_IO_DBG(pm8001_ha, 2013 PM8001_IO_DBG(pm8001_ha,
1842 pm8001_printk("IO_XFER_ERROR_UNEXPECTED_PHASE\n")); 2014 pm8001_printk("IO_XFER_ERROR_UNEXPECTED_PHASE\n"));
diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index ab0704e39040..9589fc941a8b 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -538,6 +538,7 @@ void pm8001_ccb_task_free(struct pm8001_hba_info *pm8001_ha,
538 task->lldd_task = NULL; 538 task->lldd_task = NULL;
539 ccb->task = NULL; 539 ccb->task = NULL;
540 ccb->ccb_tag = 0xFFFFFFFF; 540 ccb->ccb_tag = 0xFFFFFFFF;
541 ccb->open_retry = 0;
541 pm8001_ccb_free(pm8001_ha, ccb_idx); 542 pm8001_ccb_free(pm8001_ha, ccb_idx);
542} 543}
543 544
@@ -882,6 +883,77 @@ static int pm8001_issue_ssp_tmf(struct domain_device *dev,
882 tmf); 883 tmf);
883} 884}
884 885
886/*
 * pm8001_open_reject_retry - retry commands by ha, by task and/or by device.
 *
 * Walks all PM8001_MAX_CCB entries of pm8001_ha->ccb_info and completes each
 * matching task with resp SAS_TASK_COMPLETE, stat SAS_OPEN_REJECT and
 * open_rej_reason SAS_OREJ_RSVD_RETRY, then releases its ccb through
 * pm8001_ccb_task_free().
 *
 * @pm8001_ha: adapter whose ccb table is scanned; NULL makes this a no-op.
 * @task_to_close: when non-NULL, only the ccb carrying this task is completed.
 * @device_to_close: when non-NULL, only ccbs attached to this device are
 *	considered; when NULL, any ccb whose device pointer is an element
 *	of pm8001_ha->devices qualifies.
 *
 * Acquires pm8001_ha->lock itself. The lock is dropped around each
 * task_done() callback and re-taken before the scan continues.
 */
887void pm8001_open_reject_retry(
888 struct pm8001_hba_info *pm8001_ha,
889 struct sas_task *task_to_close,
890 struct pm8001_device *device_to_close)
891{
892 int i;
893 unsigned long flags;
894
895 if (pm8001_ha == NULL)
896 return;
897
898 spin_lock_irqsave(&pm8001_ha->lock, flags);
899
900 for (i = 0; i < PM8001_MAX_CCB; i++) {
901 struct sas_task *task;
902 struct task_status_struct *ts;
903 struct pm8001_device *pm8001_dev;
904 unsigned long flags1;
905 u32 tag;
906 struct pm8001_ccb_info *ccb = &pm8001_ha->ccb_info[i];
907
908 pm8001_dev = ccb->device;
909 if (!pm8001_dev || (pm8001_dev->dev_type == NO_DEVICE))
910 continue; /* ccb has no device, or its device slot is unused */
911 if (!device_to_close) {
912 uintptr_t d = (uintptr_t)pm8001_dev
913 - (uintptr_t)&pm8001_ha->devices; /* byte offset into the devices array */
914 if (((d % sizeof(*pm8001_dev)) != 0)
915 || ((d / sizeof(*pm8001_dev)) >= PM8001_MAX_DEVICES))
916 continue; /* pointer is not an element of pm8001_ha->devices */
917 } else if (pm8001_dev != device_to_close)
918 continue; /* caller asked for a different device */
919 tag = ccb->ccb_tag;
920 if (!tag || (tag == 0xFFFFFFFF))
921 continue; /* tag 0 and 0xFFFFFFFF (value set when a ccb is freed) are skipped */
922 task = ccb->task;
923 if (!task || !task->task_done)
924 continue; /* nothing to complete, or no completion callback */
925 if (task_to_close && (task != task_to_close))
926 continue; /* caller asked for a different task */
927 ts = &task->task_status;
928 ts->resp = SAS_TASK_COMPLETE;
929 /* Force the midlayer to retry */
930 ts->stat = SAS_OPEN_REJECT;
931 ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
932 if (pm8001_dev) /* always true here: NULL was filtered out above */
933 pm8001_dev->running_req--;
934 spin_lock_irqsave(&task->task_state_lock, flags1);
935 task->task_state_flags &= ~SAS_TASK_STATE_PENDING;
936 task->task_state_flags &= ~SAS_TASK_AT_INITIATOR;
937 task->task_state_flags |= SAS_TASK_STATE_DONE;
938 if (unlikely((task->task_state_flags
939 & SAS_TASK_STATE_ABORTED))) {
940 spin_unlock_irqrestore(&task->task_state_lock,
941 flags1);
942 pm8001_ccb_task_free(pm8001_ha, task, ccb, tag); /* aborted: free the ccb, task_done() is not called */
943 } else {
944 spin_unlock_irqrestore(&task->task_state_lock,
945 flags1);
946 pm8001_ccb_task_free(pm8001_ha, task, ccb, tag);
947 mb();/* in order to force CPU ordering */
948 spin_unlock_irqrestore(&pm8001_ha->lock, flags); /* task_done() is called without the ha lock */
949 task->task_done(task);
950 spin_lock_irqsave(&pm8001_ha->lock, flags); /* re-take the lock for the rest of the scan */
951 }
952 }
953
954 spin_unlock_irqrestore(&pm8001_ha->lock, flags);
955}
956
885/** 957/**
886 * Standard mandates link reset for ATA (type 0) and hard reset for 958 * Standard mandates link reset for ATA (type 0) and hard reset for
887 * SSP (type 1) , only for RECOVERY 959 * SSP (type 1) , only for RECOVERY
diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h
index 83a48f3247b8..11008205aeb3 100644
--- a/drivers/scsi/pm8001/pm8001_sas.h
+++ b/drivers/scsi/pm8001/pm8001_sas.h
@@ -235,6 +235,7 @@ struct pm8001_ccb_info {
235 struct pm8001_device *device; 235 struct pm8001_device *device;
236 struct pm8001_prd buf_prd[PM8001_MAX_DMA_SG]; 236 struct pm8001_prd buf_prd[PM8001_MAX_DMA_SG];
237 struct fw_control_ex *fw_control_context; 237 struct fw_control_ex *fw_control_context;
238 u8 open_retry;
238}; 239};
239 240
240struct mpi_mem { 241struct mpi_mem {
@@ -484,6 +485,10 @@ void pm8001_dev_gone(struct domain_device *dev);
484int pm8001_lu_reset(struct domain_device *dev, u8 *lun); 485int pm8001_lu_reset(struct domain_device *dev, u8 *lun);
485int pm8001_I_T_nexus_reset(struct domain_device *dev); 486int pm8001_I_T_nexus_reset(struct domain_device *dev);
486int pm8001_query_task(struct sas_task *task); 487int pm8001_query_task(struct sas_task *task);
488void pm8001_open_reject_retry(
489 struct pm8001_hba_info *pm8001_ha,
490 struct sas_task *task_to_close,
491 struct pm8001_device *device_to_close);
487int pm8001_mem_alloc(struct pci_dev *pdev, void **virt_addr, 492int pm8001_mem_alloc(struct pci_dev *pdev, void **virt_addr,
488 dma_addr_t *pphys_addr, u32 *pphys_addr_hi, u32 *pphys_addr_lo, 493 dma_addr_t *pphys_addr, u32 *pphys_addr_hi, u32 *pphys_addr_lo,
489 u32 mem_size, u32 align); 494 u32 mem_size, u32 align);