diff options
author | Stefan Weinhuber <wein@de.ibm.com> | 2009-12-07 06:51:51 -0500 |
---|---|---|
committer | Martin Schwidefsky <sky@mschwide.boeblingen.de.ibm.com> | 2009-12-07 06:51:34 -0500 |
commit | eb6e199bef288611157b8198c25d12b32bf058d0 (patch) | |
tree | 80737a2703a9f4d09cee2410342aeccb281413ae /drivers/s390/block/dasd.c | |
parent | 626350b63ef2cd447023d3dc2a34eaa7ca01bfff (diff) |
[S390] dasd: improve error recovery for internal I/O
Most of the error conditions reported by a FICON storage server
indicate situations which can be recovered. Sometimes the host just
needs to retry an I/O request, but sometimes the recovery
is more complex and requires the device driver to wait, choose
a different path, etc.
The DASD device driver has fully featured error recovery
for normal block layer I/O, but not for internal I/O requests, which
are used, for example, during device bring-up.
This can lead to situations where the IPL of a system fails because
DASD devices are not properly recognized.
This patch will extend the internal I/O handling to use the existing
error recovery procedures.
Signed-off-by: Stefan Weinhuber <wein@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Diffstat (limited to 'drivers/s390/block/dasd.c')
-rw-r--r-- | drivers/s390/block/dasd.c | 207 |
1 files changed, 152 insertions, 55 deletions
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 329115a4d4b3..4f211c175b55 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c | |||
@@ -63,6 +63,7 @@ static void do_restore_device(struct work_struct *); | |||
63 | static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *); | 63 | static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *); |
64 | static void dasd_device_timeout(unsigned long); | 64 | static void dasd_device_timeout(unsigned long); |
65 | static void dasd_block_timeout(unsigned long); | 65 | static void dasd_block_timeout(unsigned long); |
66 | static void __dasd_process_erp(struct dasd_device *, struct dasd_ccw_req *); | ||
66 | 67 | ||
67 | /* | 68 | /* |
68 | * SECTION: Operations on the device structure. | 69 | * SECTION: Operations on the device structure. |
@@ -959,7 +960,7 @@ static void dasd_device_timeout(unsigned long ptr) | |||
959 | device = (struct dasd_device *) ptr; | 960 | device = (struct dasd_device *) ptr; |
960 | spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); | 961 | spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); |
961 | /* re-activate request queue */ | 962 | /* re-activate request queue */ |
962 | device->stopped &= ~DASD_STOPPED_PENDING; | 963 | dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); |
963 | spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); | 964 | spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); |
964 | dasd_schedule_device_bh(device); | 965 | dasd_schedule_device_bh(device); |
965 | } | 966 | } |
@@ -1022,7 +1023,7 @@ void dasd_generic_handle_state_change(struct dasd_device *device) | |||
1022 | /* First of all start sense subsystem status request. */ | 1023 | /* First of all start sense subsystem status request. */ |
1023 | dasd_eer_snss(device); | 1024 | dasd_eer_snss(device); |
1024 | 1025 | ||
1025 | device->stopped &= ~DASD_STOPPED_PENDING; | 1026 | dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); |
1026 | dasd_schedule_device_bh(device); | 1027 | dasd_schedule_device_bh(device); |
1027 | if (device->block) | 1028 | if (device->block) |
1028 | dasd_schedule_block_bh(device->block); | 1029 | dasd_schedule_block_bh(device->block); |
@@ -1404,6 +1405,20 @@ void dasd_schedule_device_bh(struct dasd_device *device) | |||
1404 | tasklet_hi_schedule(&device->tasklet); | 1405 | tasklet_hi_schedule(&device->tasklet); |
1405 | } | 1406 | } |
1406 | 1407 | ||
1408 | void dasd_device_set_stop_bits(struct dasd_device *device, int bits) | ||
1409 | { | ||
1410 | device->stopped |= bits; | ||
1411 | } | ||
1412 | EXPORT_SYMBOL_GPL(dasd_device_set_stop_bits); | ||
1413 | |||
1414 | void dasd_device_remove_stop_bits(struct dasd_device *device, int bits) | ||
1415 | { | ||
1416 | device->stopped &= ~bits; | ||
1417 | if (!device->stopped) | ||
1418 | wake_up(&generic_waitq); | ||
1419 | } | ||
1420 | EXPORT_SYMBOL_GPL(dasd_device_remove_stop_bits); | ||
1421 | |||
1407 | /* | 1422 | /* |
1408 | * Queue a request to the head of the device ccw_queue. | 1423 | * Queue a request to the head of the device ccw_queue. |
1409 | * Start the I/O if possible. | 1424 | * Start the I/O if possible. |
@@ -1464,58 +1479,135 @@ static inline int _wait_for_wakeup(struct dasd_ccw_req *cqr) | |||
1464 | } | 1479 | } |
1465 | 1480 | ||
1466 | /* | 1481 | /* |
1467 | * Queue a request to the tail of the device ccw_queue and wait for | 1482 | * checks if error recovery is necessary, returns 1 if yes, 0 otherwise. |
1468 | * it's completion. | ||
1469 | */ | 1483 | */ |
1470 | int dasd_sleep_on(struct dasd_ccw_req *cqr) | 1484 | static int __dasd_sleep_on_erp(struct dasd_ccw_req *cqr) |
1471 | { | 1485 | { |
1472 | struct dasd_device *device; | 1486 | struct dasd_device *device; |
1473 | int rc; | 1487 | dasd_erp_fn_t erp_fn; |
1474 | 1488 | ||
1489 | if (cqr->status == DASD_CQR_FILLED) | ||
1490 | return 0; | ||
1475 | device = cqr->startdev; | 1491 | device = cqr->startdev; |
1492 | if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) { | ||
1493 | if (cqr->status == DASD_CQR_TERMINATED) { | ||
1494 | device->discipline->handle_terminated_request(cqr); | ||
1495 | return 1; | ||
1496 | } | ||
1497 | if (cqr->status == DASD_CQR_NEED_ERP) { | ||
1498 | erp_fn = device->discipline->erp_action(cqr); | ||
1499 | erp_fn(cqr); | ||
1500 | return 1; | ||
1501 | } | ||
1502 | if (cqr->status == DASD_CQR_FAILED) | ||
1503 | dasd_log_sense(cqr, &cqr->irb); | ||
1504 | if (cqr->refers) { | ||
1505 | __dasd_process_erp(device, cqr); | ||
1506 | return 1; | ||
1507 | } | ||
1508 | } | ||
1509 | return 0; | ||
1510 | } | ||
1476 | 1511 | ||
1477 | cqr->callback = dasd_wakeup_cb; | 1512 | static int __dasd_sleep_on_loop_condition(struct dasd_ccw_req *cqr) |
1478 | cqr->callback_data = (void *) &generic_waitq; | 1513 | { |
1479 | dasd_add_request_tail(cqr); | 1514 | if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) { |
1480 | wait_event(generic_waitq, _wait_for_wakeup(cqr)); | 1515 | if (cqr->refers) /* erp is not done yet */ |
1516 | return 1; | ||
1517 | return ((cqr->status != DASD_CQR_DONE) && | ||
1518 | (cqr->status != DASD_CQR_FAILED)); | ||
1519 | } else | ||
1520 | return (cqr->status == DASD_CQR_FILLED); | ||
1521 | } | ||
1481 | 1522 | ||
1482 | if (cqr->status == DASD_CQR_DONE) | 1523 | static int _dasd_sleep_on(struct dasd_ccw_req *maincqr, int interruptible) |
1524 | { | ||
1525 | struct dasd_device *device; | ||
1526 | int rc; | ||
1527 | struct list_head ccw_queue; | ||
1528 | struct dasd_ccw_req *cqr; | ||
1529 | |||
1530 | INIT_LIST_HEAD(&ccw_queue); | ||
1531 | maincqr->status = DASD_CQR_FILLED; | ||
1532 | device = maincqr->startdev; | ||
1533 | list_add(&maincqr->blocklist, &ccw_queue); | ||
1534 | for (cqr = maincqr; __dasd_sleep_on_loop_condition(cqr); | ||
1535 | cqr = list_first_entry(&ccw_queue, | ||
1536 | struct dasd_ccw_req, blocklist)) { | ||
1537 | |||
1538 | if (__dasd_sleep_on_erp(cqr)) | ||
1539 | continue; | ||
1540 | if (cqr->status != DASD_CQR_FILLED) /* could be failed */ | ||
1541 | continue; | ||
1542 | |||
1543 | /* Non-temporary stop condition will trigger fail fast */ | ||
1544 | if (device->stopped & ~DASD_STOPPED_PENDING && | ||
1545 | test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags) && | ||
1546 | (!dasd_eer_enabled(device))) { | ||
1547 | cqr->status = DASD_CQR_FAILED; | ||
1548 | continue; | ||
1549 | } | ||
1550 | |||
1551 | /* Don't try to start requests if device is stopped */ | ||
1552 | if (interruptible) { | ||
1553 | rc = wait_event_interruptible( | ||
1554 | generic_waitq, !(device->stopped)); | ||
1555 | if (rc == -ERESTARTSYS) { | ||
1556 | cqr->status = DASD_CQR_FAILED; | ||
1557 | maincqr->intrc = rc; | ||
1558 | continue; | ||
1559 | } | ||
1560 | } else | ||
1561 | wait_event(generic_waitq, !(device->stopped)); | ||
1562 | |||
1563 | cqr->callback = dasd_wakeup_cb; | ||
1564 | cqr->callback_data = (void *) &generic_waitq; | ||
1565 | dasd_add_request_tail(cqr); | ||
1566 | if (interruptible) { | ||
1567 | rc = wait_event_interruptible( | ||
1568 | generic_waitq, _wait_for_wakeup(cqr)); | ||
1569 | if (rc == -ERESTARTSYS) { | ||
1570 | dasd_cancel_req(cqr); | ||
1571 | /* wait (non-interruptible) for final status */ | ||
1572 | wait_event(generic_waitq, | ||
1573 | _wait_for_wakeup(cqr)); | ||
1574 | cqr->status = DASD_CQR_FAILED; | ||
1575 | maincqr->intrc = rc; | ||
1576 | continue; | ||
1577 | } | ||
1578 | } else | ||
1579 | wait_event(generic_waitq, _wait_for_wakeup(cqr)); | ||
1580 | } | ||
1581 | |||
1582 | maincqr->endclk = get_clock(); | ||
1583 | if ((maincqr->status != DASD_CQR_DONE) && | ||
1584 | (maincqr->intrc != -ERESTARTSYS)) | ||
1585 | dasd_log_sense(maincqr, &maincqr->irb); | ||
1586 | if (maincqr->status == DASD_CQR_DONE) | ||
1483 | rc = 0; | 1587 | rc = 0; |
1484 | else if (cqr->intrc) | 1588 | else if (maincqr->intrc) |
1485 | rc = cqr->intrc; | 1589 | rc = maincqr->intrc; |
1486 | else | 1590 | else |
1487 | rc = -EIO; | 1591 | rc = -EIO; |
1488 | return rc; | 1592 | return rc; |
1489 | } | 1593 | } |
1490 | 1594 | ||
1491 | /* | 1595 | /* |
1596 | * Queue a request to the tail of the device ccw_queue and wait for | ||
1597 | * it's completion. | ||
1598 | */ | ||
1599 | int dasd_sleep_on(struct dasd_ccw_req *cqr) | ||
1600 | { | ||
1601 | return _dasd_sleep_on(cqr, 0); | ||
1602 | } | ||
1603 | |||
1604 | /* | ||
1492 | * Queue a request to the tail of the device ccw_queue and wait | 1605 | * Queue a request to the tail of the device ccw_queue and wait |
1493 | * interruptible for it's completion. | 1606 | * interruptible for it's completion. |
1494 | */ | 1607 | */ |
1495 | int dasd_sleep_on_interruptible(struct dasd_ccw_req *cqr) | 1608 | int dasd_sleep_on_interruptible(struct dasd_ccw_req *cqr) |
1496 | { | 1609 | { |
1497 | struct dasd_device *device; | 1610 | return _dasd_sleep_on(cqr, 1); |
1498 | int rc; | ||
1499 | |||
1500 | device = cqr->startdev; | ||
1501 | cqr->callback = dasd_wakeup_cb; | ||
1502 | cqr->callback_data = (void *) &generic_waitq; | ||
1503 | dasd_add_request_tail(cqr); | ||
1504 | rc = wait_event_interruptible(generic_waitq, _wait_for_wakeup(cqr)); | ||
1505 | if (rc == -ERESTARTSYS) { | ||
1506 | dasd_cancel_req(cqr); | ||
1507 | /* wait (non-interruptible) for final status */ | ||
1508 | wait_event(generic_waitq, _wait_for_wakeup(cqr)); | ||
1509 | cqr->intrc = rc; | ||
1510 | } | ||
1511 | |||
1512 | if (cqr->status == DASD_CQR_DONE) | ||
1513 | rc = 0; | ||
1514 | else if (cqr->intrc) | ||
1515 | rc = cqr->intrc; | ||
1516 | else | ||
1517 | rc = -EIO; | ||
1518 | return rc; | ||
1519 | } | 1611 | } |
1520 | 1612 | ||
1521 | /* | 1613 | /* |
@@ -1629,7 +1721,7 @@ static void dasd_block_timeout(unsigned long ptr) | |||
1629 | block = (struct dasd_block *) ptr; | 1721 | block = (struct dasd_block *) ptr; |
1630 | spin_lock_irqsave(get_ccwdev_lock(block->base->cdev), flags); | 1722 | spin_lock_irqsave(get_ccwdev_lock(block->base->cdev), flags); |
1631 | /* re-activate request queue */ | 1723 | /* re-activate request queue */ |
1632 | block->base->stopped &= ~DASD_STOPPED_PENDING; | 1724 | dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); |
1633 | spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); | 1725 | spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); |
1634 | dasd_schedule_block_bh(block); | 1726 | dasd_schedule_block_bh(block); |
1635 | } | 1727 | } |
@@ -1656,11 +1748,10 @@ void dasd_block_clear_timer(struct dasd_block *block) | |||
1656 | /* | 1748 | /* |
1657 | * Process finished error recovery ccw. | 1749 | * Process finished error recovery ccw. |
1658 | */ | 1750 | */ |
1659 | static inline void __dasd_block_process_erp(struct dasd_block *block, | 1751 | static void __dasd_process_erp(struct dasd_device *device, |
1660 | struct dasd_ccw_req *cqr) | 1752 | struct dasd_ccw_req *cqr) |
1661 | { | 1753 | { |
1662 | dasd_erp_fn_t erp_fn; | 1754 | dasd_erp_fn_t erp_fn; |
1663 | struct dasd_device *device = block->base; | ||
1664 | 1755 | ||
1665 | if (cqr->status == DASD_CQR_DONE) | 1756 | if (cqr->status == DASD_CQR_DONE) |
1666 | DBF_DEV_EVENT(DBF_NOTICE, device, "%s", "ERP successful"); | 1757 | DBF_DEV_EVENT(DBF_NOTICE, device, "%s", "ERP successful"); |
@@ -1724,9 +1815,12 @@ static void __dasd_process_request_queue(struct dasd_block *block) | |||
1724 | */ | 1815 | */ |
1725 | if (!list_empty(&block->ccw_queue)) | 1816 | if (!list_empty(&block->ccw_queue)) |
1726 | break; | 1817 | break; |
1727 | spin_lock_irqsave(get_ccwdev_lock(basedev->cdev), flags); | 1818 | spin_lock_irqsave( |
1728 | basedev->stopped |= DASD_STOPPED_PENDING; | 1819 | get_ccwdev_lock(basedev->cdev), flags); |
1729 | spin_unlock_irqrestore(get_ccwdev_lock(basedev->cdev), flags); | 1820 | dasd_device_set_stop_bits(basedev, |
1821 | DASD_STOPPED_PENDING); | ||
1822 | spin_unlock_irqrestore( | ||
1823 | get_ccwdev_lock(basedev->cdev), flags); | ||
1730 | dasd_block_set_timer(block, HZ/2); | 1824 | dasd_block_set_timer(block, HZ/2); |
1731 | break; | 1825 | break; |
1732 | } | 1826 | } |
@@ -1812,7 +1906,7 @@ restart: | |||
1812 | cqr->status = DASD_CQR_FILLED; | 1906 | cqr->status = DASD_CQR_FILLED; |
1813 | cqr->retries = 255; | 1907 | cqr->retries = 255; |
1814 | spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); | 1908 | spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); |
1815 | base->stopped |= DASD_STOPPED_QUIESCE; | 1909 | dasd_device_set_stop_bits(base, DASD_STOPPED_QUIESCE); |
1816 | spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), | 1910 | spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), |
1817 | flags); | 1911 | flags); |
1818 | goto restart; | 1912 | goto restart; |
@@ -1820,7 +1914,7 @@ restart: | |||
1820 | 1914 | ||
1821 | /* Process finished ERP request. */ | 1915 | /* Process finished ERP request. */ |
1822 | if (cqr->refers) { | 1916 | if (cqr->refers) { |
1823 | __dasd_block_process_erp(block, cqr); | 1917 | __dasd_process_erp(base, cqr); |
1824 | goto restart; | 1918 | goto restart; |
1825 | } | 1919 | } |
1826 | 1920 | ||
@@ -1951,7 +2045,7 @@ restart_cb: | |||
1951 | /* Process finished ERP request. */ | 2045 | /* Process finished ERP request. */ |
1952 | if (cqr->refers) { | 2046 | if (cqr->refers) { |
1953 | spin_lock_bh(&block->queue_lock); | 2047 | spin_lock_bh(&block->queue_lock); |
1954 | __dasd_block_process_erp(block, cqr); | 2048 | __dasd_process_erp(block->base, cqr); |
1955 | spin_unlock_bh(&block->queue_lock); | 2049 | spin_unlock_bh(&block->queue_lock); |
1956 | /* restart list_for_xx loop since dasd_process_erp | 2050 | /* restart list_for_xx loop since dasd_process_erp |
1957 | * might remove multiple elements */ | 2051 | * might remove multiple elements */ |
@@ -2410,16 +2504,16 @@ int dasd_generic_notify(struct ccw_device *cdev, int event) | |||
2410 | cqr->status = DASD_CQR_QUEUED; | 2504 | cqr->status = DASD_CQR_QUEUED; |
2411 | cqr->retries++; | 2505 | cqr->retries++; |
2412 | } | 2506 | } |
2413 | device->stopped |= DASD_STOPPED_DC_WAIT; | 2507 | dasd_device_set_stop_bits(device, DASD_STOPPED_DC_WAIT); |
2414 | dasd_device_clear_timer(device); | 2508 | dasd_device_clear_timer(device); |
2415 | dasd_schedule_device_bh(device); | 2509 | dasd_schedule_device_bh(device); |
2416 | ret = 1; | 2510 | ret = 1; |
2417 | break; | 2511 | break; |
2418 | case CIO_OPER: | 2512 | case CIO_OPER: |
2419 | /* FIXME: add a sanity check. */ | 2513 | /* FIXME: add a sanity check. */ |
2420 | device->stopped &= ~DASD_STOPPED_DC_WAIT; | 2514 | dasd_device_remove_stop_bits(device, DASD_STOPPED_DC_WAIT); |
2421 | if (device->stopped & DASD_UNRESUMED_PM) { | 2515 | if (device->stopped & DASD_UNRESUMED_PM) { |
2422 | device->stopped &= ~DASD_UNRESUMED_PM; | 2516 | dasd_device_remove_stop_bits(device, DASD_UNRESUMED_PM); |
2423 | dasd_restore_device(device); | 2517 | dasd_restore_device(device); |
2424 | ret = 1; | 2518 | ret = 1; |
2425 | break; | 2519 | break; |
@@ -2444,7 +2538,7 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev) | |||
2444 | if (IS_ERR(device)) | 2538 | if (IS_ERR(device)) |
2445 | return PTR_ERR(device); | 2539 | return PTR_ERR(device); |
2446 | /* disallow new I/O */ | 2540 | /* disallow new I/O */ |
2447 | device->stopped |= DASD_STOPPED_PM; | 2541 | dasd_device_set_stop_bits(device, DASD_STOPPED_PM); |
2448 | /* clear active requests */ | 2542 | /* clear active requests */ |
2449 | INIT_LIST_HEAD(&freeze_queue); | 2543 | INIT_LIST_HEAD(&freeze_queue); |
2450 | spin_lock_irq(get_ccwdev_lock(cdev)); | 2544 | spin_lock_irq(get_ccwdev_lock(cdev)); |
@@ -2496,14 +2590,18 @@ int dasd_generic_restore_device(struct ccw_device *cdev) | |||
2496 | return PTR_ERR(device); | 2590 | return PTR_ERR(device); |
2497 | 2591 | ||
2498 | /* allow new IO again */ | 2592 | /* allow new IO again */ |
2499 | device->stopped &= ~DASD_STOPPED_PM; | 2593 | dasd_device_remove_stop_bits(device, |
2500 | device->stopped &= ~DASD_UNRESUMED_PM; | 2594 | (DASD_STOPPED_PM | DASD_UNRESUMED_PM)); |
2501 | 2595 | ||
2502 | dasd_schedule_device_bh(device); | 2596 | dasd_schedule_device_bh(device); |
2503 | 2597 | ||
2504 | if (device->discipline->restore) | 2598 | /* |
2599 | * call discipline restore function | ||
2600 | * if device is stopped do nothing e.g. for disconnected devices | ||
2601 | */ | ||
2602 | if (device->discipline->restore && !(device->stopped)) | ||
2505 | rc = device->discipline->restore(device); | 2603 | rc = device->discipline->restore(device); |
2506 | if (rc) | 2604 | if (rc || device->stopped) |
2507 | /* | 2605 | /* |
2508 | * if the resume failed for the DASD we put it in | 2606 | * if the resume failed for the DASD we put it in |
2509 | * an UNRESUMED stop state | 2607 | * an UNRESUMED stop state |
@@ -2553,8 +2651,7 @@ static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, | |||
2553 | cqr->startdev = device; | 2651 | cqr->startdev = device; |
2554 | cqr->memdev = device; | 2652 | cqr->memdev = device; |
2555 | cqr->expires = 10*HZ; | 2653 | cqr->expires = 10*HZ; |
2556 | clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); | 2654 | cqr->retries = 256; |
2557 | cqr->retries = 2; | ||
2558 | cqr->buildclk = get_clock(); | 2655 | cqr->buildclk = get_clock(); |
2559 | cqr->status = DASD_CQR_FILLED; | 2656 | cqr->status = DASD_CQR_FILLED; |
2560 | return cqr; | 2657 | return cqr; |