diff options
author | Stephen M. Cameron <scameron@beardog.cce.hp.com> | 2011-10-26 17:22:04 -0400 |
---|---|---|
committer | James Bottomley <JBottomley@Parallels.com> | 2011-10-30 06:35:01 -0400 |
commit | a0c124137a40fc22730ae87caf17e821f2dce1ed (patch) | |
tree | 93f2b17ee632867a34236d2b4f70dd88a65cac10 /drivers/scsi | |
parent | bb158eabda984851d7964d968b9859383f98a701 (diff) |
[SCSI] hpsa: detect controller lockup
When a controller lockup condition is detected,
we should fail all outstanding commands and disable
the controller. This will enable multipath solutions
to recover gracefully.
Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
Diffstat (limited to 'drivers/scsi')
-rw-r--r-- | drivers/scsi/hpsa.c | 184 | ||||
-rw-r--r-- | drivers/scsi/hpsa.h | 5 |
2 files changed, 185 insertions, 4 deletions
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 57ed00f7050a..e0119377ffe3 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/bitmap.h> | 48 | #include <linux/bitmap.h> |
49 | #include <linux/atomic.h> | 49 | #include <linux/atomic.h> |
50 | #include <linux/kthread.h> | 50 | #include <linux/kthread.h> |
51 | #include <linux/jiffies.h> | ||
51 | #include "hpsa_cmd.h" | 52 | #include "hpsa_cmd.h" |
52 | #include "hpsa.h" | 53 | #include "hpsa.h" |
53 | 54 | ||
@@ -127,6 +128,10 @@ static struct board_type products[] = { | |||
127 | 128 | ||
128 | static int number_of_controllers; | 129 | static int number_of_controllers; |
129 | 130 | ||
131 | static struct list_head hpsa_ctlr_list = LIST_HEAD_INIT(hpsa_ctlr_list); | ||
132 | static spinlock_t lockup_detector_lock; | ||
133 | static struct task_struct *hpsa_lockup_detector; | ||
134 | |||
130 | static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id); | 135 | static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id); |
131 | static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id); | 136 | static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id); |
132 | static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg); | 137 | static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg); |
@@ -1337,6 +1342,22 @@ static inline void hpsa_scsi_do_simple_cmd_core(struct ctlr_info *h, | |||
1337 | wait_for_completion(&wait); | 1342 | wait_for_completion(&wait); |
1338 | } | 1343 | } |
1339 | 1344 | ||
1345 | static void hpsa_scsi_do_simple_cmd_core_if_no_lockup(struct ctlr_info *h, | ||
1346 | struct CommandList *c) | ||
1347 | { | ||
1348 | unsigned long flags; | ||
1349 | |||
1350 | /* If controller lockup detected, fake a hardware error. */ | ||
1351 | spin_lock_irqsave(&h->lock, flags); | ||
1352 | if (unlikely(h->lockup_detected)) { | ||
1353 | spin_unlock_irqrestore(&h->lock, flags); | ||
1354 | c->err_info->CommandStatus = CMD_HARDWARE_ERR; | ||
1355 | } else { | ||
1356 | spin_unlock_irqrestore(&h->lock, flags); | ||
1357 | hpsa_scsi_do_simple_cmd_core(h, c); | ||
1358 | } | ||
1359 | } | ||
1360 | |||
1340 | static void hpsa_scsi_do_simple_cmd_with_retry(struct ctlr_info *h, | 1361 | static void hpsa_scsi_do_simple_cmd_with_retry(struct ctlr_info *h, |
1341 | struct CommandList *c, int data_direction) | 1362 | struct CommandList *c, int data_direction) |
1342 | { | 1363 | { |
@@ -2052,8 +2073,14 @@ static int hpsa_scsi_queue_command_lck(struct scsi_cmnd *cmd, | |||
2052 | } | 2073 | } |
2053 | memcpy(scsi3addr, dev->scsi3addr, sizeof(scsi3addr)); | 2074 | memcpy(scsi3addr, dev->scsi3addr, sizeof(scsi3addr)); |
2054 | 2075 | ||
2055 | /* Need a lock as this is being allocated from the pool */ | ||
2056 | spin_lock_irqsave(&h->lock, flags); | 2076 | spin_lock_irqsave(&h->lock, flags); |
2077 | if (unlikely(h->lockup_detected)) { | ||
2078 | spin_unlock_irqrestore(&h->lock, flags); | ||
2079 | cmd->result = DID_ERROR << 16; | ||
2080 | done(cmd); | ||
2081 | return 0; | ||
2082 | } | ||
2083 | /* Need a lock as this is being allocated from the pool */ | ||
2057 | c = cmd_alloc(h); | 2084 | c = cmd_alloc(h); |
2058 | spin_unlock_irqrestore(&h->lock, flags); | 2085 | spin_unlock_irqrestore(&h->lock, flags); |
2059 | if (c == NULL) { /* trouble... */ | 2086 | if (c == NULL) { /* trouble... */ |
@@ -2605,7 +2632,7 @@ static int hpsa_passthru_ioctl(struct ctlr_info *h, void __user *argp) | |||
2605 | c->SG[0].Len = iocommand.buf_size; | 2632 | c->SG[0].Len = iocommand.buf_size; |
2606 | c->SG[0].Ext = 0; /* we are not chaining*/ | 2633 | c->SG[0].Ext = 0; /* we are not chaining*/ |
2607 | } | 2634 | } |
2608 | hpsa_scsi_do_simple_cmd_core(h, c); | 2635 | hpsa_scsi_do_simple_cmd_core_if_no_lockup(h, c); |
2609 | if (iocommand.buf_size > 0) | 2636 | if (iocommand.buf_size > 0) |
2610 | hpsa_pci_unmap(h->pdev, c, 1, PCI_DMA_BIDIRECTIONAL); | 2637 | hpsa_pci_unmap(h->pdev, c, 1, PCI_DMA_BIDIRECTIONAL); |
2611 | check_ioctl_unit_attention(h, c); | 2638 | check_ioctl_unit_attention(h, c); |
@@ -2728,7 +2755,7 @@ static int hpsa_big_passthru_ioctl(struct ctlr_info *h, void __user *argp) | |||
2728 | c->SG[i].Ext = 0; | 2755 | c->SG[i].Ext = 0; |
2729 | } | 2756 | } |
2730 | } | 2757 | } |
2731 | hpsa_scsi_do_simple_cmd_core(h, c); | 2758 | hpsa_scsi_do_simple_cmd_core_if_no_lockup(h, c); |
2732 | if (sg_used) | 2759 | if (sg_used) |
2733 | hpsa_pci_unmap(h->pdev, c, sg_used, PCI_DMA_BIDIRECTIONAL); | 2760 | hpsa_pci_unmap(h->pdev, c, sg_used, PCI_DMA_BIDIRECTIONAL); |
2734 | check_ioctl_unit_attention(h, c); | 2761 | check_ioctl_unit_attention(h, c); |
@@ -3097,6 +3124,7 @@ static irqreturn_t hpsa_intx_discard_completions(int irq, void *dev_id) | |||
3097 | if (interrupt_not_for_us(h)) | 3124 | if (interrupt_not_for_us(h)) |
3098 | return IRQ_NONE; | 3125 | return IRQ_NONE; |
3099 | spin_lock_irqsave(&h->lock, flags); | 3126 | spin_lock_irqsave(&h->lock, flags); |
3127 | h->last_intr_timestamp = get_jiffies_64(); | ||
3100 | while (interrupt_pending(h)) { | 3128 | while (interrupt_pending(h)) { |
3101 | raw_tag = get_next_completion(h); | 3129 | raw_tag = get_next_completion(h); |
3102 | while (raw_tag != FIFO_EMPTY) | 3130 | while (raw_tag != FIFO_EMPTY) |
@@ -3116,6 +3144,7 @@ static irqreturn_t hpsa_msix_discard_completions(int irq, void *dev_id) | |||
3116 | return IRQ_NONE; | 3144 | return IRQ_NONE; |
3117 | 3145 | ||
3118 | spin_lock_irqsave(&h->lock, flags); | 3146 | spin_lock_irqsave(&h->lock, flags); |
3147 | h->last_intr_timestamp = get_jiffies_64(); | ||
3119 | raw_tag = get_next_completion(h); | 3148 | raw_tag = get_next_completion(h); |
3120 | while (raw_tag != FIFO_EMPTY) | 3149 | while (raw_tag != FIFO_EMPTY) |
3121 | raw_tag = next_command(h); | 3150 | raw_tag = next_command(h); |
@@ -3132,6 +3161,7 @@ static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id) | |||
3132 | if (interrupt_not_for_us(h)) | 3161 | if (interrupt_not_for_us(h)) |
3133 | return IRQ_NONE; | 3162 | return IRQ_NONE; |
3134 | spin_lock_irqsave(&h->lock, flags); | 3163 | spin_lock_irqsave(&h->lock, flags); |
3164 | h->last_intr_timestamp = get_jiffies_64(); | ||
3135 | while (interrupt_pending(h)) { | 3165 | while (interrupt_pending(h)) { |
3136 | raw_tag = get_next_completion(h); | 3166 | raw_tag = get_next_completion(h); |
3137 | while (raw_tag != FIFO_EMPTY) { | 3167 | while (raw_tag != FIFO_EMPTY) { |
@@ -3152,6 +3182,7 @@ static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id) | |||
3152 | u32 raw_tag; | 3182 | u32 raw_tag; |
3153 | 3183 | ||
3154 | spin_lock_irqsave(&h->lock, flags); | 3184 | spin_lock_irqsave(&h->lock, flags); |
3185 | h->last_intr_timestamp = get_jiffies_64(); | ||
3155 | raw_tag = get_next_completion(h); | 3186 | raw_tag = get_next_completion(h); |
3156 | while (raw_tag != FIFO_EMPTY) { | 3187 | while (raw_tag != FIFO_EMPTY) { |
3157 | if (hpsa_tag_contains_index(raw_tag)) | 3188 | if (hpsa_tag_contains_index(raw_tag)) |
@@ -4089,6 +4120,149 @@ static void hpsa_undo_allocations_after_kdump_soft_reset(struct ctlr_info *h) | |||
4089 | kfree(h); | 4120 | kfree(h); |
4090 | } | 4121 | } |
4091 | 4122 | ||
4123 | static void remove_ctlr_from_lockup_detector_list(struct ctlr_info *h) | ||
4124 | { | ||
4125 | assert_spin_locked(&lockup_detector_lock); | ||
4126 | if (!hpsa_lockup_detector) | ||
4127 | return; | ||
4128 | if (h->lockup_detected) | ||
4129 | return; /* already stopped the lockup detector */ | ||
4130 | list_del(&h->lockup_list); | ||
4131 | } | ||
4132 | |||
4133 | /* Called when controller lockup detected. */ | ||
4134 | static void fail_all_cmds_on_list(struct ctlr_info *h, struct list_head *list) | ||
4135 | { | ||
4136 | struct CommandList *c = NULL; | ||
4137 | |||
4138 | assert_spin_locked(&h->lock); | ||
4139 | /* Mark all outstanding commands as failed and complete them. */ | ||
4140 | while (!list_empty(list)) { | ||
4141 | c = list_entry(list->next, struct CommandList, list); | ||
4142 | c->err_info->CommandStatus = CMD_HARDWARE_ERR; | ||
4143 | finish_cmd(c, c->Header.Tag.lower); | ||
4144 | } | ||
4145 | } | ||
4146 | |||
4147 | static void controller_lockup_detected(struct ctlr_info *h) | ||
4148 | { | ||
4149 | unsigned long flags; | ||
4150 | |||
4151 | assert_spin_locked(&lockup_detector_lock); | ||
4152 | remove_ctlr_from_lockup_detector_list(h); | ||
4153 | h->access.set_intr_mask(h, HPSA_INTR_OFF); | ||
4154 | spin_lock_irqsave(&h->lock, flags); | ||
4155 | h->lockup_detected = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET); | ||
4156 | spin_unlock_irqrestore(&h->lock, flags); | ||
4157 | dev_warn(&h->pdev->dev, "Controller lockup detected: 0x%08x\n", | ||
4158 | h->lockup_detected); | ||
4159 | pci_disable_device(h->pdev); | ||
4160 | spin_lock_irqsave(&h->lock, flags); | ||
4161 | fail_all_cmds_on_list(h, &h->cmpQ); | ||
4162 | fail_all_cmds_on_list(h, &h->reqQ); | ||
4163 | spin_unlock_irqrestore(&h->lock, flags); | ||
4164 | } | ||
4165 | |||
4166 | #define HEARTBEAT_SAMPLE_INTERVAL (10 * HZ) | ||
4167 | #define HEARTBEAT_CHECK_MINIMUM_INTERVAL (HEARTBEAT_SAMPLE_INTERVAL / 2) | ||
4168 | |||
4169 | static void detect_controller_lockup(struct ctlr_info *h) | ||
4170 | { | ||
4171 | u64 now; | ||
4172 | u32 heartbeat; | ||
4173 | unsigned long flags; | ||
4174 | |||
4175 | assert_spin_locked(&lockup_detector_lock); | ||
4176 | now = get_jiffies_64(); | ||
4177 | /* If we've received an interrupt recently, we're ok. */ | ||
4178 | if (time_after64(h->last_intr_timestamp + | ||
4179 | (HEARTBEAT_CHECK_MINIMUM_INTERVAL), now)) | ||
4180 | return; | ||
4181 | |||
4182 | /* | ||
4183 | * If we've already checked the heartbeat recently, we're ok. | ||
4184 | * This could happen if someone sends us a signal. We | ||
4185 | * otherwise don't care about signals in this thread. | ||
4186 | */ | ||
4187 | if (time_after64(h->last_heartbeat_timestamp + | ||
4188 | (HEARTBEAT_CHECK_MINIMUM_INTERVAL), now)) | ||
4189 | return; | ||
4190 | |||
4191 | /* If heartbeat has not changed since we last looked, we're not ok. */ | ||
4192 | spin_lock_irqsave(&h->lock, flags); | ||
4193 | heartbeat = readl(&h->cfgtable->HeartBeat); | ||
4194 | spin_unlock_irqrestore(&h->lock, flags); | ||
4195 | if (h->last_heartbeat == heartbeat) { | ||
4196 | controller_lockup_detected(h); | ||
4197 | return; | ||
4198 | } | ||
4199 | |||
4200 | /* We're ok. */ | ||
4201 | h->last_heartbeat = heartbeat; | ||
4202 | h->last_heartbeat_timestamp = now; | ||
4203 | } | ||
4204 | |||
4205 | static int detect_controller_lockup_thread(void *notused) | ||
4206 | { | ||
4207 | struct ctlr_info *h; | ||
4208 | unsigned long flags; | ||
4209 | |||
4210 | while (1) { | ||
4211 | struct list_head *this, *tmp; | ||
4212 | |||
4213 | schedule_timeout_interruptible(HEARTBEAT_SAMPLE_INTERVAL); | ||
4214 | if (kthread_should_stop()) | ||
4215 | break; | ||
4216 | spin_lock_irqsave(&lockup_detector_lock, flags); | ||
4217 | list_for_each_safe(this, tmp, &hpsa_ctlr_list) { | ||
4218 | h = list_entry(this, struct ctlr_info, lockup_list); | ||
4219 | detect_controller_lockup(h); | ||
4220 | } | ||
4221 | spin_unlock_irqrestore(&lockup_detector_lock, flags); | ||
4222 | } | ||
4223 | return 0; | ||
4224 | } | ||
4225 | |||
4226 | static void add_ctlr_to_lockup_detector_list(struct ctlr_info *h) | ||
4227 | { | ||
4228 | unsigned long flags; | ||
4229 | |||
4230 | spin_lock_irqsave(&lockup_detector_lock, flags); | ||
4231 | list_add_tail(&h->lockup_list, &hpsa_ctlr_list); | ||
4232 | spin_unlock_irqrestore(&lockup_detector_lock, flags); | ||
4233 | } | ||
4234 | |||
4235 | static void start_controller_lockup_detector(struct ctlr_info *h) | ||
4236 | { | ||
4237 | /* Start the lockup detector thread if not already started */ | ||
4238 | if (!hpsa_lockup_detector) { | ||
4239 | spin_lock_init(&lockup_detector_lock); | ||
4240 | hpsa_lockup_detector = | ||
4241 | kthread_run(detect_controller_lockup_thread, | ||
4242 | NULL, "hpsa"); | ||
4243 | } | ||
4244 | if (!hpsa_lockup_detector) { | ||
4245 | dev_warn(&h->pdev->dev, | ||
4246 | "Could not start lockup detector thread\n"); | ||
4247 | return; | ||
4248 | } | ||
4249 | add_ctlr_to_lockup_detector_list(h); | ||
4250 | } | ||
4251 | |||
4252 | static void stop_controller_lockup_detector(struct ctlr_info *h) | ||
4253 | { | ||
4254 | unsigned long flags; | ||
4255 | |||
4256 | spin_lock_irqsave(&lockup_detector_lock, flags); | ||
4257 | remove_ctlr_from_lockup_detector_list(h); | ||
4258 | /* If the list of ctlr's to monitor is empty, stop the thread */ | ||
4259 | if (list_empty(&hpsa_ctlr_list)) { | ||
4260 | kthread_stop(hpsa_lockup_detector); | ||
4261 | hpsa_lockup_detector = NULL; | ||
4262 | } | ||
4263 | spin_unlock_irqrestore(&lockup_detector_lock, flags); | ||
4264 | } | ||
4265 | |||
4092 | static int __devinit hpsa_init_one(struct pci_dev *pdev, | 4266 | static int __devinit hpsa_init_one(struct pci_dev *pdev, |
4093 | const struct pci_device_id *ent) | 4267 | const struct pci_device_id *ent) |
4094 | { | 4268 | { |
@@ -4234,6 +4408,7 @@ reinit_after_soft_reset: | |||
4234 | 4408 | ||
4235 | hpsa_hba_inquiry(h); | 4409 | hpsa_hba_inquiry(h); |
4236 | hpsa_register_scsi(h); /* hook ourselves into SCSI subsystem */ | 4410 | hpsa_register_scsi(h); /* hook ourselves into SCSI subsystem */ |
4411 | start_controller_lockup_detector(h); | ||
4237 | return 1; | 4412 | return 1; |
4238 | 4413 | ||
4239 | clean4: | 4414 | clean4: |
@@ -4296,10 +4471,11 @@ static void __devexit hpsa_remove_one(struct pci_dev *pdev) | |||
4296 | struct ctlr_info *h; | 4471 | struct ctlr_info *h; |
4297 | 4472 | ||
4298 | if (pci_get_drvdata(pdev) == NULL) { | 4473 | if (pci_get_drvdata(pdev) == NULL) { |
4299 | dev_err(&pdev->dev, "unable to remove device \n"); | 4474 | dev_err(&pdev->dev, "unable to remove device\n"); |
4300 | return; | 4475 | return; |
4301 | } | 4476 | } |
4302 | h = pci_get_drvdata(pdev); | 4477 | h = pci_get_drvdata(pdev); |
4478 | stop_controller_lockup_detector(h); | ||
4303 | hpsa_unregister_scsi(h); /* unhook from SCSI subsystem */ | 4479 | hpsa_unregister_scsi(h); /* unhook from SCSI subsystem */ |
4304 | hpsa_shutdown(pdev); | 4480 | hpsa_shutdown(pdev); |
4305 | iounmap(h->vaddr); | 4481 | iounmap(h->vaddr); |
diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h index 73858bc22e57..91edafb8c7e6 100644 --- a/drivers/scsi/hpsa.h +++ b/drivers/scsi/hpsa.h | |||
@@ -121,6 +121,11 @@ struct ctlr_info { | |||
121 | unsigned char reply_pool_wraparound; | 121 | unsigned char reply_pool_wraparound; |
122 | u32 *blockFetchTable; | 122 | u32 *blockFetchTable; |
123 | unsigned char *hba_inquiry_data; | 123 | unsigned char *hba_inquiry_data; |
124 | u64 last_intr_timestamp; | ||
125 | u32 last_heartbeat; | ||
126 | u64 last_heartbeat_timestamp; | ||
127 | u32 lockup_detected; | ||
128 | struct list_head lockup_list; | ||
124 | }; | 129 | }; |
125 | #define HPSA_ABORT_MSG 0 | 130 | #define HPSA_ABORT_MSG 0 |
126 | #define HPSA_DEVICE_RESET_MSG 1 | 131 | #define HPSA_DEVICE_RESET_MSG 1 |