aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Stefan Richter <stefanr@s5r6.in-berlin.de> 2010-08-16 16:13:34 -0400
committer Stefan Richter <stefanr@s5r6.in-berlin.de> 2010-08-19 14:28:25 -0400
commit a481e97d3cdc40b9d58271675bd4f0abb79d4872 (patch)
tree c860d626fbee5f1fb2bc9511fa7fb3c586801c56
parent 6c74340bce253ea95c9ee801b3c411a333937edf (diff)
firewire: sbp2: fix stall with "Unsolicited response"
Fix I/O stalls with some 4-bay RAID enclosures which are based on OXUF936QSE: - Onnto dataTale RSM4QO, old firmware (not anymore with current firmware), - inXtron Hydra Super-S LCM, old as well as current firmware when used in RAID-5 mode, perhaps also in other RAID modes. The stalls happen during heavy or moderate disk traffic in periods that are a multiple of 5 minutes, roughly twice per hour. They are caused by the target responding too late to an ORB_Pointer register write: The target responds after Split_Timeout, hence firewire-core cancels the transaction, and firewire-sbp2 fails the SCSI request. The SCSI core retries the request, which fails again (and again), hence SCSI core calls firewire-sbp2's abort handler (and even the Management_Agent register write in the abort handler has the transaction timeout problem). During all that, the process which issued the I/O is stalled in I/O wait state. Meanwhile, the target actually acts on the first failed SCSI request: It responds to the ORB_Pointer write later (seen in the kernel log as "firewire_core: Unsolicited response") and also finishes the SCSI request with proper status (seen in the kernel log as "firewire_sbp2: status write for unknown orb"). So let's just ignore RCODE_CANCELLED in the transaction callback and wait for the target to complete the ORB nevertheless. This requires a small modification in sbp2_cancel_orbs(); it now needs to call orb->callback() regardless of whether fw_cancel_transaction() found the transaction unfinished or finished. A different solution is to increase Split_Timeout on the local node. (Tested: 2000ms timeout; maybe 1000ms or something like that works too. 200ms is insufficient. Standard is 100ms.) However, I would rather not do this because any software on any node could change the Split_Timeout to something unsuitable. Or such a large Split_Timeout may be undesirable for other purposes. Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
-rw-r--r-- drivers/firewire/sbp2.c | 11
1 files changed, 8 insertions, 3 deletions
diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c
index e6cbe491f7ee..bfae4b309791 100644
--- a/drivers/firewire/sbp2.c
+++ b/drivers/firewire/sbp2.c
@@ -472,12 +472,18 @@ static void complete_transaction(struct fw_card *card, int rcode,
472 * So this callback only sets the rcode if it hasn't already 472 * So this callback only sets the rcode if it hasn't already
473 * been set and only does the cleanup if the transaction 473 * been set and only does the cleanup if the transaction
474 * failed and we didn't already get a status write. 474 * failed and we didn't already get a status write.
475 *
476 * Here we treat RCODE_CANCELLED like RCODE_COMPLETE because some
477 * OXUF936QSE firmwares occasionally respond after Split_Timeout and
478 * complete the ORB just fine. Note, we also get RCODE_CANCELLED
479 * from sbp2_cancel_orbs() if fw_cancel_transaction() == 0.
475 */ 480 */
476 spin_lock_irqsave(&card->lock, flags); 481 spin_lock_irqsave(&card->lock, flags);
477 482
478 if (orb->rcode == -1) 483 if (orb->rcode == -1)
479 orb->rcode = rcode; 484 orb->rcode = rcode;
480 if (orb->rcode != RCODE_COMPLETE) { 485
486 if (orb->rcode != RCODE_COMPLETE && orb->rcode != RCODE_CANCELLED) {
481 list_del(&orb->link); 487 list_del(&orb->link);
482 spin_unlock_irqrestore(&card->lock, flags); 488 spin_unlock_irqrestore(&card->lock, flags);
483 489
@@ -526,8 +532,7 @@ static int sbp2_cancel_orbs(struct sbp2_logical_unit *lu)
526 532
527 list_for_each_entry_safe(orb, next, &list, link) { 533 list_for_each_entry_safe(orb, next, &list, link) {
528 retval = 0; 534 retval = 0;
529 if (fw_cancel_transaction(device->card, &orb->t) == 0) 535 fw_cancel_transaction(device->card, &orb->t);
530 continue;
531 536
532 orb->rcode = RCODE_CANCELLED; 537 orb->rcode = RCODE_CANCELLED;
533 orb->callback(orb, NULL); 538 orb->callback(orb, NULL);