aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/scsi
diff options
context:
space:
mode:
author Stephen M. Cameron <scameron@beardog.cce.hp.com> 2011-10-26 17:22:04 -0400
committer James Bottomley <JBottomley@Parallels.com> 2011-10-30 06:35:01 -0400
commit a0c124137a40fc22730ae87caf17e821f2dce1ed (patch)
tree 93f2b17ee632867a34236d2b4f70dd88a65cac10 /drivers/scsi
parent bb158eabda984851d7964d968b9859383f98a701 (diff)
[SCSI] hpsa: detect controller lockup
When controller lockup condition is detected, we should fail all outstanding commands and disable the controller. This will enable multipath solutions to recover gracefully. Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com> Signed-off-by: James Bottomley <JBottomley@Parallels.com>
Diffstat (limited to 'drivers/scsi')
-rw-r--r-- drivers/scsi/hpsa.c 184
-rw-r--r-- drivers/scsi/hpsa.h 5
2 files changed, 185 insertions, 4 deletions
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 57ed00f7050a..e0119377ffe3 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -48,6 +48,7 @@
48#include <linux/bitmap.h> 48#include <linux/bitmap.h>
49#include <linux/atomic.h> 49#include <linux/atomic.h>
50#include <linux/kthread.h> 50#include <linux/kthread.h>
51#include <linux/jiffies.h>
51#include "hpsa_cmd.h" 52#include "hpsa_cmd.h"
52#include "hpsa.h" 53#include "hpsa.h"
53 54
@@ -127,6 +128,10 @@ static struct board_type products[] = {
127 128
128static int number_of_controllers; 129static int number_of_controllers;
129 130
131static struct list_head hpsa_ctlr_list = LIST_HEAD_INIT(hpsa_ctlr_list);
132static spinlock_t lockup_detector_lock;
133static struct task_struct *hpsa_lockup_detector;
134
130static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id); 135static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id);
131static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id); 136static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id);
132static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg); 137static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg);
@@ -1337,6 +1342,22 @@ static inline void hpsa_scsi_do_simple_cmd_core(struct ctlr_info *h,
1337 wait_for_completion(&wait); 1342 wait_for_completion(&wait);
1338} 1343}
1339 1344
1345static void hpsa_scsi_do_simple_cmd_core_if_no_lockup(struct ctlr_info *h,
1346 struct CommandList *c)
1347{
1348 unsigned long flags;
1349
1350 /* If controller lockup detected, fake a hardware error. */
1351 spin_lock_irqsave(&h->lock, flags);
1352 if (unlikely(h->lockup_detected)) {
1353 spin_unlock_irqrestore(&h->lock, flags);
1354 c->err_info->CommandStatus = CMD_HARDWARE_ERR;
1355 } else {
1356 spin_unlock_irqrestore(&h->lock, flags);
1357 hpsa_scsi_do_simple_cmd_core(h, c);
1358 }
1359}
1360
1340static void hpsa_scsi_do_simple_cmd_with_retry(struct ctlr_info *h, 1361static void hpsa_scsi_do_simple_cmd_with_retry(struct ctlr_info *h,
1341 struct CommandList *c, int data_direction) 1362 struct CommandList *c, int data_direction)
1342{ 1363{
@@ -2052,8 +2073,14 @@ static int hpsa_scsi_queue_command_lck(struct scsi_cmnd *cmd,
2052 } 2073 }
2053 memcpy(scsi3addr, dev->scsi3addr, sizeof(scsi3addr)); 2074 memcpy(scsi3addr, dev->scsi3addr, sizeof(scsi3addr));
2054 2075
2055 /* Need a lock as this is being allocated from the pool */
2056 spin_lock_irqsave(&h->lock, flags); 2076 spin_lock_irqsave(&h->lock, flags);
2077 if (unlikely(h->lockup_detected)) {
2078 spin_unlock_irqrestore(&h->lock, flags);
2079 cmd->result = DID_ERROR << 16;
2080 done(cmd);
2081 return 0;
2082 }
2083 /* Need a lock as this is being allocated from the pool */
2057 c = cmd_alloc(h); 2084 c = cmd_alloc(h);
2058 spin_unlock_irqrestore(&h->lock, flags); 2085 spin_unlock_irqrestore(&h->lock, flags);
2059 if (c == NULL) { /* trouble... */ 2086 if (c == NULL) { /* trouble... */
@@ -2605,7 +2632,7 @@ static int hpsa_passthru_ioctl(struct ctlr_info *h, void __user *argp)
2605 c->SG[0].Len = iocommand.buf_size; 2632 c->SG[0].Len = iocommand.buf_size;
2606 c->SG[0].Ext = 0; /* we are not chaining*/ 2633 c->SG[0].Ext = 0; /* we are not chaining*/
2607 } 2634 }
2608 hpsa_scsi_do_simple_cmd_core(h, c); 2635 hpsa_scsi_do_simple_cmd_core_if_no_lockup(h, c);
2609 if (iocommand.buf_size > 0) 2636 if (iocommand.buf_size > 0)
2610 hpsa_pci_unmap(h->pdev, c, 1, PCI_DMA_BIDIRECTIONAL); 2637 hpsa_pci_unmap(h->pdev, c, 1, PCI_DMA_BIDIRECTIONAL);
2611 check_ioctl_unit_attention(h, c); 2638 check_ioctl_unit_attention(h, c);
@@ -2728,7 +2755,7 @@ static int hpsa_big_passthru_ioctl(struct ctlr_info *h, void __user *argp)
2728 c->SG[i].Ext = 0; 2755 c->SG[i].Ext = 0;
2729 } 2756 }
2730 } 2757 }
2731 hpsa_scsi_do_simple_cmd_core(h, c); 2758 hpsa_scsi_do_simple_cmd_core_if_no_lockup(h, c);
2732 if (sg_used) 2759 if (sg_used)
2733 hpsa_pci_unmap(h->pdev, c, sg_used, PCI_DMA_BIDIRECTIONAL); 2760 hpsa_pci_unmap(h->pdev, c, sg_used, PCI_DMA_BIDIRECTIONAL);
2734 check_ioctl_unit_attention(h, c); 2761 check_ioctl_unit_attention(h, c);
@@ -3097,6 +3124,7 @@ static irqreturn_t hpsa_intx_discard_completions(int irq, void *dev_id)
3097 if (interrupt_not_for_us(h)) 3124 if (interrupt_not_for_us(h))
3098 return IRQ_NONE; 3125 return IRQ_NONE;
3099 spin_lock_irqsave(&h->lock, flags); 3126 spin_lock_irqsave(&h->lock, flags);
3127 h->last_intr_timestamp = get_jiffies_64();
3100 while (interrupt_pending(h)) { 3128 while (interrupt_pending(h)) {
3101 raw_tag = get_next_completion(h); 3129 raw_tag = get_next_completion(h);
3102 while (raw_tag != FIFO_EMPTY) 3130 while (raw_tag != FIFO_EMPTY)
@@ -3116,6 +3144,7 @@ static irqreturn_t hpsa_msix_discard_completions(int irq, void *dev_id)
3116 return IRQ_NONE; 3144 return IRQ_NONE;
3117 3145
3118 spin_lock_irqsave(&h->lock, flags); 3146 spin_lock_irqsave(&h->lock, flags);
3147 h->last_intr_timestamp = get_jiffies_64();
3119 raw_tag = get_next_completion(h); 3148 raw_tag = get_next_completion(h);
3120 while (raw_tag != FIFO_EMPTY) 3149 while (raw_tag != FIFO_EMPTY)
3121 raw_tag = next_command(h); 3150 raw_tag = next_command(h);
@@ -3132,6 +3161,7 @@ static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id)
3132 if (interrupt_not_for_us(h)) 3161 if (interrupt_not_for_us(h))
3133 return IRQ_NONE; 3162 return IRQ_NONE;
3134 spin_lock_irqsave(&h->lock, flags); 3163 spin_lock_irqsave(&h->lock, flags);
3164 h->last_intr_timestamp = get_jiffies_64();
3135 while (interrupt_pending(h)) { 3165 while (interrupt_pending(h)) {
3136 raw_tag = get_next_completion(h); 3166 raw_tag = get_next_completion(h);
3137 while (raw_tag != FIFO_EMPTY) { 3167 while (raw_tag != FIFO_EMPTY) {
@@ -3152,6 +3182,7 @@ static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id)
3152 u32 raw_tag; 3182 u32 raw_tag;
3153 3183
3154 spin_lock_irqsave(&h->lock, flags); 3184 spin_lock_irqsave(&h->lock, flags);
3185 h->last_intr_timestamp = get_jiffies_64();
3155 raw_tag = get_next_completion(h); 3186 raw_tag = get_next_completion(h);
3156 while (raw_tag != FIFO_EMPTY) { 3187 while (raw_tag != FIFO_EMPTY) {
3157 if (hpsa_tag_contains_index(raw_tag)) 3188 if (hpsa_tag_contains_index(raw_tag))
@@ -4089,6 +4120,149 @@ static void hpsa_undo_allocations_after_kdump_soft_reset(struct ctlr_info *h)
4089 kfree(h); 4120 kfree(h);
4090} 4121}
4091 4122
4123static void remove_ctlr_from_lockup_detector_list(struct ctlr_info *h)
4124{
4125 assert_spin_locked(&lockup_detector_lock);
4126 if (!hpsa_lockup_detector)
4127 return;
4128 if (h->lockup_detected)
4129 return; /* already stopped the lockup detector */
4130 list_del(&h->lockup_list);
4131}
4132
4133/* Called when controller lockup detected. */
4134static void fail_all_cmds_on_list(struct ctlr_info *h, struct list_head *list)
4135{
4136 struct CommandList *c = NULL;
4137
4138 assert_spin_locked(&h->lock);
4139 /* Mark all outstanding commands as failed and complete them. */
4140 while (!list_empty(list)) {
4141 c = list_entry(list->next, struct CommandList, list);
4142 c->err_info->CommandStatus = CMD_HARDWARE_ERR;
4143 finish_cmd(c, c->Header.Tag.lower);
4144 }
4145}
4146
4147static void controller_lockup_detected(struct ctlr_info *h)
4148{
4149 unsigned long flags;
4150
4151 assert_spin_locked(&lockup_detector_lock);
4152 remove_ctlr_from_lockup_detector_list(h);
4153 h->access.set_intr_mask(h, HPSA_INTR_OFF);
4154 spin_lock_irqsave(&h->lock, flags);
4155 h->lockup_detected = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET);
4156 spin_unlock_irqrestore(&h->lock, flags);
4157 dev_warn(&h->pdev->dev, "Controller lockup detected: 0x%08x\n",
4158 h->lockup_detected);
4159 pci_disable_device(h->pdev);
4160 spin_lock_irqsave(&h->lock, flags);
4161 fail_all_cmds_on_list(h, &h->cmpQ);
4162 fail_all_cmds_on_list(h, &h->reqQ);
4163 spin_unlock_irqrestore(&h->lock, flags);
4164}
4165
4166#define HEARTBEAT_SAMPLE_INTERVAL (10 * HZ)
4167#define HEARTBEAT_CHECK_MINIMUM_INTERVAL (HEARTBEAT_SAMPLE_INTERVAL / 2)
4168
4169static void detect_controller_lockup(struct ctlr_info *h)
4170{
4171 u64 now;
4172 u32 heartbeat;
4173 unsigned long flags;
4174
4175 assert_spin_locked(&lockup_detector_lock);
4176 now = get_jiffies_64();
4177 /* If we've received an interrupt recently, we're ok. */
4178 if (time_after64(h->last_intr_timestamp +
4179 (HEARTBEAT_CHECK_MINIMUM_INTERVAL), now))
4180 return;
4181
4182 /*
4183 * If we've already checked the heartbeat recently, we're ok.
4184 * This could happen if someone sends us a signal. We
4185 * otherwise don't care about signals in this thread.
4186 */
4187 if (time_after64(h->last_heartbeat_timestamp +
4188 (HEARTBEAT_CHECK_MINIMUM_INTERVAL), now))
4189 return;
4190
4191 /* If heartbeat has not changed since we last looked, we're not ok. */
4192 spin_lock_irqsave(&h->lock, flags);
4193 heartbeat = readl(&h->cfgtable->HeartBeat);
4194 spin_unlock_irqrestore(&h->lock, flags);
4195 if (h->last_heartbeat == heartbeat) {
4196 controller_lockup_detected(h);
4197 return;
4198 }
4199
4200 /* We're ok. */
4201 h->last_heartbeat = heartbeat;
4202 h->last_heartbeat_timestamp = now;
4203}
4204
4205static int detect_controller_lockup_thread(void *notused)
4206{
4207 struct ctlr_info *h;
4208 unsigned long flags;
4209
4210 while (1) {
4211 struct list_head *this, *tmp;
4212
4213 schedule_timeout_interruptible(HEARTBEAT_SAMPLE_INTERVAL);
4214 if (kthread_should_stop())
4215 break;
4216 spin_lock_irqsave(&lockup_detector_lock, flags);
4217 list_for_each_safe(this, tmp, &hpsa_ctlr_list) {
4218 h = list_entry(this, struct ctlr_info, lockup_list);
4219 detect_controller_lockup(h);
4220 }
4221 spin_unlock_irqrestore(&lockup_detector_lock, flags);
4222 }
4223 return 0;
4224}
4225
4226static void add_ctlr_to_lockup_detector_list(struct ctlr_info *h)
4227{
4228 unsigned long flags;
4229
4230 spin_lock_irqsave(&lockup_detector_lock, flags);
4231 list_add_tail(&h->lockup_list, &hpsa_ctlr_list);
4232 spin_unlock_irqrestore(&lockup_detector_lock, flags);
4233}
4234
4235static void start_controller_lockup_detector(struct ctlr_info *h)
4236{
4237 /* Start the lockup detector thread if not already started */
4238 if (!hpsa_lockup_detector) {
4239 spin_lock_init(&lockup_detector_lock);
4240 hpsa_lockup_detector =
4241 kthread_run(detect_controller_lockup_thread,
4242 NULL, "hpsa");
4243 }
4244 if (!hpsa_lockup_detector) {
4245 dev_warn(&h->pdev->dev,
4246 "Could not start lockup detector thread\n");
4247 return;
4248 }
4249 add_ctlr_to_lockup_detector_list(h);
4250}
4251
4252static void stop_controller_lockup_detector(struct ctlr_info *h)
4253{
4254 unsigned long flags;
4255
4256 spin_lock_irqsave(&lockup_detector_lock, flags);
4257 remove_ctlr_from_lockup_detector_list(h);
4258 /* If the list of ctlr's to monitor is empty, stop the thread */
4259 if (list_empty(&hpsa_ctlr_list)) {
4260 kthread_stop(hpsa_lockup_detector);
4261 hpsa_lockup_detector = NULL;
4262 }
4263 spin_unlock_irqrestore(&lockup_detector_lock, flags);
4264}
4265
4092static int __devinit hpsa_init_one(struct pci_dev *pdev, 4266static int __devinit hpsa_init_one(struct pci_dev *pdev,
4093 const struct pci_device_id *ent) 4267 const struct pci_device_id *ent)
4094{ 4268{
@@ -4234,6 +4408,7 @@ reinit_after_soft_reset:
4234 4408
4235 hpsa_hba_inquiry(h); 4409 hpsa_hba_inquiry(h);
4236 hpsa_register_scsi(h); /* hook ourselves into SCSI subsystem */ 4410 hpsa_register_scsi(h); /* hook ourselves into SCSI subsystem */
4411 start_controller_lockup_detector(h);
4237 return 1; 4412 return 1;
4238 4413
4239clean4: 4414clean4:
@@ -4296,10 +4471,11 @@ static void __devexit hpsa_remove_one(struct pci_dev *pdev)
4296 struct ctlr_info *h; 4471 struct ctlr_info *h;
4297 4472
4298 if (pci_get_drvdata(pdev) == NULL) { 4473 if (pci_get_drvdata(pdev) == NULL) {
4299 dev_err(&pdev->dev, "unable to remove device \n"); 4474 dev_err(&pdev->dev, "unable to remove device\n");
4300 return; 4475 return;
4301 } 4476 }
4302 h = pci_get_drvdata(pdev); 4477 h = pci_get_drvdata(pdev);
4478 stop_controller_lockup_detector(h);
4303 hpsa_unregister_scsi(h); /* unhook from SCSI subsystem */ 4479 hpsa_unregister_scsi(h); /* unhook from SCSI subsystem */
4304 hpsa_shutdown(pdev); 4480 hpsa_shutdown(pdev);
4305 iounmap(h->vaddr); 4481 iounmap(h->vaddr);
diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h
index 73858bc22e57..91edafb8c7e6 100644
--- a/drivers/scsi/hpsa.h
+++ b/drivers/scsi/hpsa.h
@@ -121,6 +121,11 @@ struct ctlr_info {
121 unsigned char reply_pool_wraparound; 121 unsigned char reply_pool_wraparound;
122 u32 *blockFetchTable; 122 u32 *blockFetchTable;
123 unsigned char *hba_inquiry_data; 123 unsigned char *hba_inquiry_data;
124 u64 last_intr_timestamp;
125 u32 last_heartbeat;
126 u64 last_heartbeat_timestamp;
127 u32 lockup_detected;
128 struct list_head lockup_list;
124}; 129};
125#define HPSA_ABORT_MSG 0 130#define HPSA_ABORT_MSG 0
126#define HPSA_DEVICE_RESET_MSG 1 131#define HPSA_DEVICE_RESET_MSG 1