aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/platforms/pseries/eeh.c
diff options
context:
space:
mode:
author: Linas Vepstas <linas@linas.org> 2005-11-03 19:50:04 -0500
committer: Paul Mackerras <paulus@samba.org> 2005-11-09 19:38:05 -0500
commit: 172ca9261800bacbbc7d320d9924d9b482dff8de (patch)
tree: 7abd6ddf1e6b9a147a0826c374f0d1bca80806d3 /arch/powerpc/platforms/pseries/eeh.c
parent: 7f79da7accd63a6adb84f4602f66779f6a701e7b (diff)
[PATCH] ppc64: PCI error event dispatcher
12-eeh-event-dispatcher.patch ppc64: EEH Recovery dispatcher thread This patch adds a mechanism to create recovery threads when an EEH event is received. Since an EEH freeze state may be detected within an interrupt context, we need to get out of the interrupt context before starting recovery. This dispatcher does this in two steps: first, it uses a workqueue to get out, and then launches a kernel thread, so that the recovery routine can sleep for extended periods without upsetting the keventd. A kernel thread is created with each EEH event, rather than having one long-running daemon started at boot time. This is because it is anticipated that EEH events will be very rare (very very rare, ideally) and so it's pointless to clutter the process tables with a daemon that will almost never run. Signed-off-by: Linas Vepstas <linas@austin.ibm.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc/platforms/pseries/eeh.c')
-rw-r--r-- arch/powerpc/platforms/pseries/eeh.c 138
1 files changed, 15 insertions, 123 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 9df1d5018363..1fec99d53311 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -19,7 +19,6 @@
19 19
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/notifier.h>
23#include <linux/pci.h> 22#include <linux/pci.h>
24#include <linux/proc_fs.h> 23#include <linux/proc_fs.h>
25#include <linux/rbtree.h> 24#include <linux/rbtree.h>
@@ -27,12 +26,12 @@
27#include <linux/spinlock.h> 26#include <linux/spinlock.h>
28#include <asm/atomic.h> 27#include <asm/atomic.h>
29#include <asm/eeh.h> 28#include <asm/eeh.h>
29#include <asm/eeh_event.h>
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/machdep.h> 31#include <asm/machdep.h>
32#include <asm/ppc-pci.h>
32#include <asm/rtas.h> 33#include <asm/rtas.h>
33#include <asm/atomic.h>
34#include <asm/systemcfg.h> 34#include <asm/systemcfg.h>
35#include <asm/ppc-pci.h>
36 35
37#undef DEBUG 36#undef DEBUG
38 37
@@ -70,14 +69,6 @@
70 * and sent out for processing. 69 * and sent out for processing.
71 */ 70 */
72 71
73/* EEH event workqueue setup. */
74static DEFINE_SPINLOCK(eeh_eventlist_lock);
75LIST_HEAD(eeh_eventlist);
76static void eeh_event_handler(void *);
77DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
78
79static struct notifier_block *eeh_notifier_chain;
80
81/* If a device driver keeps reading an MMIO register in an interrupt 72/* If a device driver keeps reading an MMIO register in an interrupt
82 * handler after a slot isolation event has occurred, we assume it 73 * handler after a slot isolation event has occurred, we assume it
83 * is broken and panic. This sets the threshold for how many read 74 * is broken and panic. This sets the threshold for how many read
@@ -421,24 +412,6 @@ void eeh_slot_error_detail (struct pci_dn *pdn, int severity)
421} 412}
422 413
423/** 414/**
424 * eeh_register_notifier - Register to find out about EEH events.
425 * @nb: notifier block to callback on events
426 */
427int eeh_register_notifier(struct notifier_block *nb)
428{
429 return notifier_chain_register(&eeh_notifier_chain, nb);
430}
431
432/**
433 * eeh_unregister_notifier - Unregister to an EEH event notifier.
434 * @nb: notifier block to callback on events
435 */
436int eeh_unregister_notifier(struct notifier_block *nb)
437{
438 return notifier_chain_unregister(&eeh_notifier_chain, nb);
439}
440
441/**
442 * read_slot_reset_state - Read the reset state of a device node's slot 415 * read_slot_reset_state - Read the reset state of a device node's slot
443 * @dn: device node to read 416 * @dn: device node to read
444 * @rets: array to return results in 417 * @rets: array to return results in
@@ -461,73 +434,6 @@ static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
461} 434}
462 435
463/** 436/**
464 * eeh_panic - call panic() for an eeh event that cannot be handled.
465 * The philosophy of this routine is that it is better to panic and
466 * halt the OS than it is to risk possible data corruption by
467 * oblivious device drivers that don't know better.
468 *
469 * @dev pci device that had an eeh event
470 * @reset_state current reset state of the device slot
471 */
472static void eeh_panic(struct pci_dev *dev, int reset_state)
473{
474 /*
475 * XXX We should create a separate sysctl for this.
476 *
477 * Since the panic_on_oops sysctl is used to halt the system
478 * in light of potential corruption, we can use it here.
479 */
480 if (panic_on_oops) {
481 struct device_node *dn = pci_device_to_OF_node(dev);
482 eeh_slot_error_detail (PCI_DN(dn), 2 /* Permanent Error */);
483 panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
484 pci_name(dev));
485 }
486 else {
487 __get_cpu_var(ignored_failures)++;
488 printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
489 reset_state, pci_name(dev));
490 }
491}
492
493/**
494 * eeh_event_handler - dispatch EEH events. The detection of a frozen
495 * slot can occur inside an interrupt, where it can be hard to do
496 * anything about it. The goal of this routine is to pull these
497 * detection events out of the context of the interrupt handler, and
498 * re-dispatch them for processing at a later time in a normal context.
499 *
500 * @dummy - unused
501 */
502static void eeh_event_handler(void *dummy)
503{
504 unsigned long flags;
505 struct eeh_event *event;
506
507 while (1) {
508 spin_lock_irqsave(&eeh_eventlist_lock, flags);
509 event = NULL;
510 if (!list_empty(&eeh_eventlist)) {
511 event = list_entry(eeh_eventlist.next, struct eeh_event, list);
512 list_del(&event->list);
513 }
514 spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
515 if (event == NULL)
516 break;
517
518 printk(KERN_INFO "EEH: MMIO failure (%d), notifiying device "
519 "%s\n", event->reset_state,
520 pci_name(event->dev));
521
522 notifier_call_chain (&eeh_notifier_chain,
523 EEH_NOTIFY_FREEZE, event);
524
525 pci_dev_put(event->dev);
526 kfree(event);
527 }
528}
529
530/**
531 * eeh_token_to_phys - convert EEH address token to phys address 437 * eeh_token_to_phys - convert EEH address token to phys address
532 * @token i/o token, should be address in the form 0xA.... 438 * @token i/o token, should be address in the form 0xA....
533 */ 439 */
@@ -613,8 +519,6 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
613 int ret; 519 int ret;
614 int rets[3]; 520 int rets[3];
615 unsigned long flags; 521 unsigned long flags;
616 int reset_state;
617 struct eeh_event *event;
618 struct pci_dn *pdn; 522 struct pci_dn *pdn;
619 struct device_node *pe_dn; 523 struct device_node *pe_dn;
620 int rc = 0; 524 int rc = 0;
@@ -722,33 +626,12 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
722 __eeh_mark_slot (pe_dn); 626 __eeh_mark_slot (pe_dn);
723 spin_unlock_irqrestore(&confirm_error_lock, flags); 627 spin_unlock_irqrestore(&confirm_error_lock, flags);
724 628
725 reset_state = rets[0]; 629 eeh_send_failure_event (dn, dev, rets[0], rets[2]);
726 630
727 eeh_slot_error_detail (pdn, 1 /* Temporary Error */);
728
729 printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
730 rets[0], dn->name, dn->full_name);
731 event = kmalloc(sizeof(*event), GFP_ATOMIC);
732 if (event == NULL) {
733 eeh_panic(dev, reset_state);
734 return 1;
735 }
736
737 event->dev = dev;
738 event->dn = dn;
739 event->reset_state = reset_state;
740
741 /* We may or may not be called in an interrupt context */
742 spin_lock_irqsave(&eeh_eventlist_lock, flags);
743 list_add(&event->list, &eeh_eventlist);
744 spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
745
746 /* Most EEH events are due to device driver bugs. Having 631 /* Most EEH events are due to device driver bugs. Having
747 * a stack trace will help the device-driver authors figure 632 * a stack trace will help the device-driver authors figure
748 * out what happened. So print that out. */ 633 * out what happened. So print that out. */
749 if (rets[0] != 5) dump_stack(); 634 if (rets[0] != 5) dump_stack();
750 schedule_work(&eeh_event_wq);
751
752 return 1; 635 return 1;
753 636
754dn_unlock: 637dn_unlock:
@@ -793,6 +676,14 @@ unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned lon
793 676
794EXPORT_SYMBOL(eeh_check_failure); 677EXPORT_SYMBOL(eeh_check_failure);
795 678
679/* ------------------------------------------------------------- */
680/* The code below deals with enabling EEH for devices during the
681 * early boot sequence. EEH must be enabled before any PCI probing
682 * can be done.
683 */
684
685#define EEH_ENABLE 1
686
796struct eeh_early_enable_info { 687struct eeh_early_enable_info {
797 unsigned int buid_hi; 688 unsigned int buid_hi;
798 unsigned int buid_lo; 689 unsigned int buid_lo;
@@ -850,8 +741,9 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
850 /* First register entry is addr (00BBSS00) */ 741 /* First register entry is addr (00BBSS00) */
851 /* Try to enable eeh */ 742 /* Try to enable eeh */
852 ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, 743 ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
853 regs[0], info->buid_hi, info->buid_lo, 744 regs[0], info->buid_hi, info->buid_lo,
854 EEH_ENABLE); 745 EEH_ENABLE);
746
855 if (ret == 0) { 747 if (ret == 0) {
856 eeh_subsystem_enabled = 1; 748 eeh_subsystem_enabled = 1;
857 pdn->eeh_mode |= EEH_MODE_SUPPORTED; 749 pdn->eeh_mode |= EEH_MODE_SUPPORTED;