aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/firewire
diff options
context:
space:
mode:
author Stefan Richter <stefanr@s5r6.in-berlin.de> 2008-02-16 10:37:28 -0500
committer Stefan Richter <stefanr@s5r6.in-berlin.de> 2008-02-19 13:57:23 -0500
commit 2e2705bdcb959372d54bf7f79dd9a555ec2adfb4 (patch)
tree 18d08c5282e2cd6de52df89944bc874508d51fb6 /drivers/firewire
parent e80de3704ac30ddb7f9a12447a2ecee32ccd7880 (diff)
firewire: fw-sbp2: (try to) avoid I/O errors during reconnect

While fw-sbp2 takes the necessary time to reconnect to a logical unit after bus reset, the SCSI core keeps sending new commands. They are all immediately completed with host busy status, and application clients or filesystems will break quickly. The SCSI device might even be taken offline: http://bugzilla.kernel.org/show_bug.cgi?id=9734

The only remedy seems to be to block the SCSI device until reconnect. Alas the SCSI core has no useful API to block only one logical unit i.e. the scsi_device, therefore we block the entire Scsi_Host. This currently corresponds to an SBP-2 target. In case of targets with multiple logical units, we need to satisfy the dependencies between logical units by carefully tracking the blocking state of the target and its units. We block all logical units of a target as soon as one of them needs to be blocked, and keep them blocked until all of them are ready to be unblocked.

Furthermore, as the history of the old sbp2 driver has shown, the scsi_block_requests() API is a minefield with high potential of deadlocks. We therefore take extra measures to keep logical units unblocked during __scsi_add_device() and during shutdown.

This avoids I/O errors during reconnect in many but alas not in all cases. There may still be errors after a re-login had to be performed. Also, some bridges have been seen to cease fetching management ORBs if I/O went on up until a bus reset. In these cases, all management ORBs time out after mgt_orb_timeout. The old sbp2 driver is less vulnerable or maybe not vulnerable to this, for as yet unknown reasons.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>

Diffstat (limited to 'drivers/firewire')
-rw-r--r-- drivers/firewire/fw-sbp2.c | 126
1 files changed, 122 insertions, 4 deletions
diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c
index 6d10934c58f1..ea4811c45512 100644
--- a/drivers/firewire/fw-sbp2.c
+++ b/drivers/firewire/fw-sbp2.c
@@ -139,6 +139,7 @@ struct sbp2_logical_unit {
139 int generation; 139 int generation;
140 int retries; 140 int retries;
141 struct delayed_work work; 141 struct delayed_work work;
142 bool blocked;
142}; 143};
143 144
144/* 145/*
@@ -157,6 +158,9 @@ struct sbp2_target {
157 int address_high; 158 int address_high;
158 unsigned int workarounds; 159 unsigned int workarounds;
159 unsigned int mgt_orb_timeout; 160 unsigned int mgt_orb_timeout;
161
162 int dont_block; /* counter for each logical unit */
163 int blocked; /* ditto */
160}; 164};
161 165
162/* 166/*
@@ -646,6 +650,107 @@ static void sbp2_agent_reset_no_wait(struct sbp2_logical_unit *lu)
646 &z, sizeof(z), complete_agent_reset_write_no_wait, t); 650 &z, sizeof(z), complete_agent_reset_write_no_wait, t);
647} 651}
648 652
653static void sbp2_set_generation(struct sbp2_logical_unit *lu, int generation)
654{
655 struct fw_card *card = fw_device(lu->tgt->unit->device.parent)->card;
656 unsigned long flags;
657
658 /* serialize with comparisons of lu->generation and card->generation */
659 spin_lock_irqsave(&card->lock, flags);
660 lu->generation = generation;
661 spin_unlock_irqrestore(&card->lock, flags);
662}
663
664static inline void sbp2_allow_block(struct sbp2_logical_unit *lu)
665{
666 /*
667 * We may access dont_block without taking card->lock here:
668 * All callers of sbp2_allow_block() and all callers of sbp2_unblock()
669 * are currently serialized against each other.
670 * And a wrong result in sbp2_conditionally_block()'s access of
671 * dont_block is rather harmless, it simply misses its first chance.
672 */
673 --lu->tgt->dont_block;
674}
675
676/*
677 * Blocks lu->tgt if all of the following conditions are met:
678 * - Login, INQUIRY, and high-level SCSI setup of all of the target's
679 * logical units have been finished (indicated by dont_block == 0).
680 * - lu->generation is stale.
681 *
682 * Note, scsi_block_requests() must be called while holding card->lock,
683 * otherwise it might foil sbp2_[conditionally_]unblock()'s attempt to
684 * unblock the target.
685 */
686static void sbp2_conditionally_block(struct sbp2_logical_unit *lu)
687{
688 struct sbp2_target *tgt = lu->tgt;
689 struct fw_card *card = fw_device(tgt->unit->device.parent)->card;
690 struct Scsi_Host *shost =
691 container_of((void *)tgt, struct Scsi_Host, hostdata[0]);
692 unsigned long flags;
693
694 spin_lock_irqsave(&card->lock, flags);
695 if (!tgt->dont_block && !lu->blocked &&
696 lu->generation != card->generation) {
697 lu->blocked = true;
698 if (++tgt->blocked == 1) {
699 scsi_block_requests(shost);
700 fw_notify("blocked %s\n", lu->tgt->bus_id);
701 }
702 }
703 spin_unlock_irqrestore(&card->lock, flags);
704}
705
706/*
707 * Unblocks lu->tgt as soon as all its logical units can be unblocked.
708 * Note, it is harmless to run scsi_unblock_requests() outside the
709 * card->lock protected section. On the other hand, running it inside
710 * the section might clash with shost->host_lock.
711 */
712static void sbp2_conditionally_unblock(struct sbp2_logical_unit *lu)
713{
714 struct sbp2_target *tgt = lu->tgt;
715 struct fw_card *card = fw_device(tgt->unit->device.parent)->card;
716 struct Scsi_Host *shost =
717 container_of((void *)tgt, struct Scsi_Host, hostdata[0]);
718 unsigned long flags;
719 bool unblock = false;
720
721 spin_lock_irqsave(&card->lock, flags);
722 if (lu->blocked && lu->generation == card->generation) {
723 lu->blocked = false;
724 unblock = --tgt->blocked == 0;
725 }
726 spin_unlock_irqrestore(&card->lock, flags);
727
728 if (unblock) {
729 scsi_unblock_requests(shost);
730 fw_notify("unblocked %s\n", lu->tgt->bus_id);
731 }
732}
733
734/*
735 * Prevents future blocking of tgt and unblocks it.
736 * Note, it is harmless to run scsi_unblock_requests() outside the
737 * card->lock protected section. On the other hand, running it inside
738 * the section might clash with shost->host_lock.
739 */
740static void sbp2_unblock(struct sbp2_target *tgt)
741{
742 struct fw_card *card = fw_device(tgt->unit->device.parent)->card;
743 struct Scsi_Host *shost =
744 container_of((void *)tgt, struct Scsi_Host, hostdata[0]);
745 unsigned long flags;
746
747 spin_lock_irqsave(&card->lock, flags);
748 ++tgt->dont_block;
749 spin_unlock_irqrestore(&card->lock, flags);
750
751 scsi_unblock_requests(shost);
752}
753
649static void sbp2_release_target(struct kref *kref) 754static void sbp2_release_target(struct kref *kref)
650{ 755{
651 struct sbp2_target *tgt = container_of(kref, struct sbp2_target, kref); 756 struct sbp2_target *tgt = container_of(kref, struct sbp2_target, kref);
@@ -653,6 +758,9 @@ static void sbp2_release_target(struct kref *kref)
653 struct Scsi_Host *shost = 758 struct Scsi_Host *shost =
654 container_of((void *)tgt, struct Scsi_Host, hostdata[0]); 759 container_of((void *)tgt, struct Scsi_Host, hostdata[0]);
655 760
761 /* prevent deadlocks */
762 sbp2_unblock(tgt);
763
656 list_for_each_entry_safe(lu, next, &tgt->lu_list, link) { 764 list_for_each_entry_safe(lu, next, &tgt->lu_list, link) {
657 if (lu->sdev) 765 if (lu->sdev)
658 scsi_remove_device(lu->sdev); 766 scsi_remove_device(lu->sdev);
@@ -717,17 +825,20 @@ static void sbp2_login(struct work_struct *work)
717 825
718 if (sbp2_send_management_orb(lu, node_id, generation, 826 if (sbp2_send_management_orb(lu, node_id, generation,
719 SBP2_LOGIN_REQUEST, lu->lun, &response) < 0) { 827 SBP2_LOGIN_REQUEST, lu->lun, &response) < 0) {
720 if (lu->retries++ < 5) 828 if (lu->retries++ < 5) {
721 sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5)); 829 sbp2_queue_work(lu, DIV_ROUND_UP(HZ, 5));
722 else 830 } else {
723 fw_error("%s: failed to login to LUN %04x\n", 831 fw_error("%s: failed to login to LUN %04x\n",
724 tgt->bus_id, lu->lun); 832 tgt->bus_id, lu->lun);
833 /* Let any waiting I/O fail from now on. */
834 sbp2_unblock(lu->tgt);
835 }
725 goto out; 836 goto out;
726 } 837 }
727 838
728 lu->generation = generation;
729 tgt->node_id = node_id; 839 tgt->node_id = node_id;
730 tgt->address_high = local_node_id << 16; 840 tgt->address_high = local_node_id << 16;
841 sbp2_set_generation(lu, generation);
731 842
732 /* Get command block agent offset and login id. */ 843 /* Get command block agent offset and login id. */
733 lu->command_block_agent_address = 844 lu->command_block_agent_address =
@@ -749,6 +860,7 @@ static void sbp2_login(struct work_struct *work)
749 /* This was a re-login. */ 860 /* This was a re-login. */
750 if (lu->sdev) { 861 if (lu->sdev) {
751 sbp2_cancel_orbs(lu); 862 sbp2_cancel_orbs(lu);
863 sbp2_conditionally_unblock(lu);
752 goto out; 864 goto out;
753 } 865 }
754 866
@@ -785,6 +897,7 @@ static void sbp2_login(struct work_struct *work)
785 897
786 /* No error during __scsi_add_device() */ 898 /* No error during __scsi_add_device() */
787 lu->sdev = sdev; 899 lu->sdev = sdev;
900 sbp2_allow_block(lu);
788 goto out; 901 goto out;
789 902
790 out_logout_login: 903 out_logout_login:
@@ -825,6 +938,8 @@ static int sbp2_add_logical_unit(struct sbp2_target *tgt, int lun_entry)
825 lu->sdev = NULL; 938 lu->sdev = NULL;
826 lu->lun = lun_entry & 0xffff; 939 lu->lun = lun_entry & 0xffff;
827 lu->retries = 0; 940 lu->retries = 0;
941 lu->blocked = false;
942 ++tgt->dont_block;
828 INIT_LIST_HEAD(&lu->orb_list); 943 INIT_LIST_HEAD(&lu->orb_list);
829 INIT_DELAYED_WORK(&lu->work, sbp2_login); 944 INIT_DELAYED_WORK(&lu->work, sbp2_login);
830 945
@@ -1041,15 +1156,16 @@ static void sbp2_reconnect(struct work_struct *work)
1041 goto out; 1156 goto out;
1042 } 1157 }
1043 1158
1044 lu->generation = generation;
1045 tgt->node_id = node_id; 1159 tgt->node_id = node_id;
1046 tgt->address_high = local_node_id << 16; 1160 tgt->address_high = local_node_id << 16;
1161 sbp2_set_generation(lu, generation);
1047 1162
1048 fw_notify("%s: reconnected to LUN %04x (%d retries)\n", 1163 fw_notify("%s: reconnected to LUN %04x (%d retries)\n",
1049 tgt->bus_id, lu->lun, lu->retries); 1164 tgt->bus_id, lu->lun, lu->retries);
1050 1165
1051 sbp2_agent_reset(lu); 1166 sbp2_agent_reset(lu);
1052 sbp2_cancel_orbs(lu); 1167 sbp2_cancel_orbs(lu);
1168 sbp2_conditionally_unblock(lu);
1053 out: 1169 out:
1054 sbp2_target_put(tgt); 1170 sbp2_target_put(tgt);
1055} 1171}
@@ -1066,6 +1182,7 @@ static void sbp2_update(struct fw_unit *unit)
1066 * Iteration over tgt->lu_list is therefore safe here. 1182 * Iteration over tgt->lu_list is therefore safe here.
1067 */ 1183 */
1068 list_for_each_entry(lu, &tgt->lu_list, link) { 1184 list_for_each_entry(lu, &tgt->lu_list, link) {
1185 sbp2_conditionally_block(lu);
1069 lu->retries = 0; 1186 lu->retries = 0;
1070 sbp2_queue_work(lu, 0); 1187 sbp2_queue_work(lu, 0);
1071 } 1188 }
@@ -1169,6 +1286,7 @@ complete_command_orb(struct sbp2_orb *base_orb, struct sbp2_status *status)
1169 * or when sending the write (less likely). 1286 * or when sending the write (less likely).
1170 */ 1287 */
1171 result = DID_BUS_BUSY << 16; 1288 result = DID_BUS_BUSY << 16;
1289 sbp2_conditionally_block(orb->lu);
1172 } 1290 }
1173 1291
1174 dma_unmap_single(device->card->device, orb->base.request_bus, 1292 dma_unmap_single(device->card->device, orb->base.request_bus,