aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/scsi
diff options
context:
space:
mode:
authorJames Smart <James.Smart@Emulex.Com>2006-05-11 13:27:09 -0400
committerJames Bottomley <jejb@mulgrave.il.steeleye.com>2006-06-27 11:52:49 -0400
commita0785edff7b316cad566bdc369d3d034c4c1a39c (patch)
tree5a0b143118c2bb5bb1f9bc8938d8568f7215f562 /drivers/scsi
parent79ac6745e4d95cd583bca744d313a323deb4adc2 (diff)
[SCSI] fc transport: resolve scan vs delete deadlocks
In a prior posting to linux-scsi on the fc transport and workq deadlocks, we noted a second error that did not have a patch: http://marc.theaimsgroup.com/?l=linux-scsi&m=114467847711383&w=2 - There's a deadlock where scsi_remove_target() has to sit behind scsi_scan_target() due to contention over the scan_lock(). Subsequently we posted a request for comments about the deadlock: http://marc.theaimsgroup.com/?l=linux-scsi&m=114469358829500&w=2 This posting resolves the second error. Here's what we now understand, and are implementing: If the lldd deletes the rport while a scan is active, the sdev's queue is blocked, which stops the issuing of commands associated with the scan. At this point, the scan stalls, and does so with the shost->scan_mutex held. If, at this point, any scan or delete request is made on the host, it will stall waiting for the scan_mutex. For the FC transport, we queue all delete work to a single workq. So, things worked fine when competing with the scan, as long as the target blocking the scan was the same target at the top of our delete workq, as the delete workq routine always unblocked just prior to requesting the delete. Unfortunately, if the top of our delete workq was for a different target, we deadlock. Additionally, if the target blocking scan returned, we were unblocking it in the scan workq routine, which really won't execute until the existing stalled scan workq completes (e.g. we're re-scheduling it while it is in the midst of its execution). This patch moves the unblock out of the workq routines and moves it to the context that is scheduling the work. This ensures that at some point, we will unblock the target that is blocking scan. Please note, however, that the deadlock condition may still occur while it waits for the transport to time out an unblock on a target. Worst case, this is bounded by the transport dev_loss_tmo (default: 30 seconds). 
Finally, Michael Reed deserves the credit for the bulk of this patch, its analysis, and its testing. Thank you for your help. Note: The request for comments statements about the gross-ness of the scan_mutex still stand. Signed-off-by: Michael Reed <mdr@sgi.com> Signed-off-by: James Smart <james.smart@emulex.com> Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
Diffstat (limited to 'drivers/scsi')
-rw-r--r--drivers/scsi/scsi_transport_fc.c28
1 files changed, 15 insertions, 13 deletions
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index f2db7a41cf1d..c76e73a3ffbe 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -1284,7 +1284,9 @@ EXPORT_SYMBOL(fc_release_transport);
1284 * @work: Work to queue for execution. 1284 * @work: Work to queue for execution.
1285 * 1285 *
1286 * Return value: 1286 * Return value:
1287 * 0 on success / != 0 for error 1287 * 1 - work queued for execution
1288 * 0 - work is already queued
1289 * -EINVAL - work queue doesn't exist
1288 **/ 1290 **/
1289static int 1291static int
1290fc_queue_work(struct Scsi_Host *shost, struct work_struct *work) 1292fc_queue_work(struct Scsi_Host *shost, struct work_struct *work)
@@ -1434,8 +1436,6 @@ fc_starget_delete(void *data)
1434 struct Scsi_Host *shost = rport_to_shost(rport); 1436 struct Scsi_Host *shost = rport_to_shost(rport);
1435 unsigned long flags; 1437 unsigned long flags;
1436 1438
1437 scsi_target_unblock(&rport->dev);
1438
1439 spin_lock_irqsave(shost->host_lock, flags); 1439 spin_lock_irqsave(shost->host_lock, flags);
1440 if (rport->flags & FC_RPORT_DEVLOSS_PENDING) { 1440 if (rport->flags & FC_RPORT_DEVLOSS_PENDING) {
1441 spin_unlock_irqrestore(shost->host_lock, flags); 1441 spin_unlock_irqrestore(shost->host_lock, flags);
@@ -1707,6 +1707,8 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel,
1707 1707
1708 spin_unlock_irqrestore(shost->host_lock, flags); 1708 spin_unlock_irqrestore(shost->host_lock, flags);
1709 1709
1710 scsi_target_unblock(&rport->dev);
1711
1710 return rport; 1712 return rport;
1711 } 1713 }
1712 } 1714 }
@@ -1762,9 +1764,10 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel,
1762 /* initiate a scan of the target */ 1764 /* initiate a scan of the target */
1763 rport->flags |= FC_RPORT_SCAN_PENDING; 1765 rport->flags |= FC_RPORT_SCAN_PENDING;
1764 scsi_queue_work(shost, &rport->scan_work); 1766 scsi_queue_work(shost, &rport->scan_work);
1765 } 1767 spin_unlock_irqrestore(shost->host_lock, flags);
1766 1768 scsi_target_unblock(&rport->dev);
1767 spin_unlock_irqrestore(shost->host_lock, flags); 1769 } else
1770 spin_unlock_irqrestore(shost->host_lock, flags);
1768 1771
1769 return rport; 1772 return rport;
1770 } 1773 }
@@ -1938,6 +1941,7 @@ fc_remote_port_rolechg(struct fc_rport *rport, u32 roles)
1938 rport->flags |= FC_RPORT_SCAN_PENDING; 1941 rport->flags |= FC_RPORT_SCAN_PENDING;
1939 scsi_queue_work(shost, &rport->scan_work); 1942 scsi_queue_work(shost, &rport->scan_work);
1940 spin_unlock_irqrestore(shost->host_lock, flags); 1943 spin_unlock_irqrestore(shost->host_lock, flags);
1944 scsi_target_unblock(&rport->dev);
1941 } 1945 }
1942} 1946}
1943EXPORT_SYMBOL(fc_remote_port_rolechg); 1947EXPORT_SYMBOL(fc_remote_port_rolechg);
@@ -1970,8 +1974,9 @@ fc_timeout_deleted_rport(void *data)
1970 dev_printk(KERN_ERR, &rport->dev, 1974 dev_printk(KERN_ERR, &rport->dev,
1971 "blocked FC remote port time out: no longer" 1975 "blocked FC remote port time out: no longer"
1972 " a FCP target, removing starget\n"); 1976 " a FCP target, removing starget\n");
1973 fc_queue_work(shost, &rport->stgt_delete_work);
1974 spin_unlock_irqrestore(shost->host_lock, flags); 1977 spin_unlock_irqrestore(shost->host_lock, flags);
1978 scsi_target_unblock(&rport->dev);
1979 fc_queue_work(shost, &rport->stgt_delete_work);
1975 return; 1980 return;
1976 } 1981 }
1977 1982
@@ -2035,17 +2040,15 @@ fc_timeout_deleted_rport(void *data)
2035 * went away and didn't come back - we'll remove 2040 * went away and didn't come back - we'll remove
2036 * all attached scsi devices. 2041 * all attached scsi devices.
2037 */ 2042 */
2038 fc_queue_work(shost, &rport->stgt_delete_work);
2039
2040 spin_unlock_irqrestore(shost->host_lock, flags); 2043 spin_unlock_irqrestore(shost->host_lock, flags);
2044
2045 scsi_target_unblock(&rport->dev);
2046 fc_queue_work(shost, &rport->stgt_delete_work);
2041} 2047}
2042 2048
2043/** 2049/**
2044 * fc_scsi_scan_rport - called to perform a scsi scan on a remote port. 2050 * fc_scsi_scan_rport - called to perform a scsi scan on a remote port.
2045 * 2051 *
2046 * Will unblock the target (in case it went away and has now come back),
2047 * then invoke a scan.
2048 *
2049 * @data: remote port to be scanned. 2052 * @data: remote port to be scanned.
2050 **/ 2053 **/
2051static void 2054static void
@@ -2057,7 +2060,6 @@ fc_scsi_scan_rport(void *data)
2057 2060
2058 if ((rport->port_state == FC_PORTSTATE_ONLINE) && 2061 if ((rport->port_state == FC_PORTSTATE_ONLINE) &&
2059 (rport->roles & FC_RPORT_ROLE_FCP_TARGET)) { 2062 (rport->roles & FC_RPORT_ROLE_FCP_TARGET)) {
2060 scsi_target_unblock(&rport->dev);
2061 scsi_scan_target(&rport->dev, rport->channel, 2063 scsi_scan_target(&rport->dev, rport->channel,
2062 rport->scsi_target_id, SCAN_WILD_CARD, 1); 2064 rport->scsi_target_id, SCAN_WILD_CARD, 1);
2063 } 2065 }