diff options
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/platforms/pseries/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 138 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_event.c | 155 |
3 files changed, 171 insertions, 124 deletions
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index dbdffb2fe429..27515476ad6c 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile | |||
@@ -3,4 +3,4 @@ obj-y := pci.o lpar.o hvCall.o nvram.o reconfig.o \ | |||
3 | obj-$(CONFIG_SMP) += smp.o | 3 | obj-$(CONFIG_SMP) += smp.o |
4 | obj-$(CONFIG_IBMVIO) += vio.o | 4 | obj-$(CONFIG_IBMVIO) += vio.o |
5 | obj-$(CONFIG_XICS) += xics.o | 5 | obj-$(CONFIG_XICS) += xics.o |
6 | obj-$(CONFIG_EEH) += eeh.o | 6 | obj-$(CONFIG_EEH) += eeh.o eeh_event.o |
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 9df1d5018363..1fec99d53311 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c | |||
@@ -19,7 +19,6 @@ | |||
19 | 19 | ||
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/list.h> | 21 | #include <linux/list.h> |
22 | #include <linux/notifier.h> | ||
23 | #include <linux/pci.h> | 22 | #include <linux/pci.h> |
24 | #include <linux/proc_fs.h> | 23 | #include <linux/proc_fs.h> |
25 | #include <linux/rbtree.h> | 24 | #include <linux/rbtree.h> |
@@ -27,12 +26,12 @@ | |||
27 | #include <linux/spinlock.h> | 26 | #include <linux/spinlock.h> |
28 | #include <asm/atomic.h> | 27 | #include <asm/atomic.h> |
29 | #include <asm/eeh.h> | 28 | #include <asm/eeh.h> |
29 | #include <asm/eeh_event.h> | ||
30 | #include <asm/io.h> | 30 | #include <asm/io.h> |
31 | #include <asm/machdep.h> | 31 | #include <asm/machdep.h> |
32 | #include <asm/ppc-pci.h> | ||
32 | #include <asm/rtas.h> | 33 | #include <asm/rtas.h> |
33 | #include <asm/atomic.h> | ||
34 | #include <asm/systemcfg.h> | 34 | #include <asm/systemcfg.h> |
35 | #include <asm/ppc-pci.h> | ||
36 | 35 | ||
37 | #undef DEBUG | 36 | #undef DEBUG |
38 | 37 | ||
@@ -70,14 +69,6 @@ | |||
70 | * and sent out for processing. | 69 | * and sent out for processing. |
71 | */ | 70 | */ |
72 | 71 | ||
73 | /* EEH event workqueue setup. */ | ||
74 | static DEFINE_SPINLOCK(eeh_eventlist_lock); | ||
75 | LIST_HEAD(eeh_eventlist); | ||
76 | static void eeh_event_handler(void *); | ||
77 | DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL); | ||
78 | |||
79 | static struct notifier_block *eeh_notifier_chain; | ||
80 | |||
81 | /* If a device driver keeps reading an MMIO register in an interrupt | 72 | /* If a device driver keeps reading an MMIO register in an interrupt |
82 | * handler after a slot isolation event has occurred, we assume it | 73 | * handler after a slot isolation event has occurred, we assume it |
83 | * is broken and panic. This sets the threshold for how many read | 74 | * is broken and panic. This sets the threshold for how many read |
@@ -421,24 +412,6 @@ void eeh_slot_error_detail (struct pci_dn *pdn, int severity) | |||
421 | } | 412 | } |
422 | 413 | ||
423 | /** | 414 | /** |
424 | * eeh_register_notifier - Register to find out about EEH events. | ||
425 | * @nb: notifier block to callback on events | ||
426 | */ | ||
427 | int eeh_register_notifier(struct notifier_block *nb) | ||
428 | { | ||
429 | return notifier_chain_register(&eeh_notifier_chain, nb); | ||
430 | } | ||
431 | |||
432 | /** | ||
433 | * eeh_unregister_notifier - Unregister to an EEH event notifier. | ||
434 | * @nb: notifier block to callback on events | ||
435 | */ | ||
436 | int eeh_unregister_notifier(struct notifier_block *nb) | ||
437 | { | ||
438 | return notifier_chain_unregister(&eeh_notifier_chain, nb); | ||
439 | } | ||
440 | |||
441 | /** | ||
442 | * read_slot_reset_state - Read the reset state of a device node's slot | 415 | * read_slot_reset_state - Read the reset state of a device node's slot |
443 | * @dn: device node to read | 416 | * @dn: device node to read |
444 | * @rets: array to return results in | 417 | * @rets: array to return results in |
@@ -461,73 +434,6 @@ static int read_slot_reset_state(struct pci_dn *pdn, int rets[]) | |||
461 | } | 434 | } |
462 | 435 | ||
463 | /** | 436 | /** |
464 | * eeh_panic - call panic() for an eeh event that cannot be handled. | ||
465 | * The philosophy of this routine is that it is better to panic and | ||
466 | * halt the OS than it is to risk possible data corruption by | ||
467 | * oblivious device drivers that don't know better. | ||
468 | * | ||
469 | * @dev pci device that had an eeh event | ||
470 | * @reset_state current reset state of the device slot | ||
471 | */ | ||
472 | static void eeh_panic(struct pci_dev *dev, int reset_state) | ||
473 | { | ||
474 | /* | ||
475 | * XXX We should create a separate sysctl for this. | ||
476 | * | ||
477 | * Since the panic_on_oops sysctl is used to halt the system | ||
478 | * in light of potential corruption, we can use it here. | ||
479 | */ | ||
480 | if (panic_on_oops) { | ||
481 | struct device_node *dn = pci_device_to_OF_node(dev); | ||
482 | eeh_slot_error_detail (PCI_DN(dn), 2 /* Permanent Error */); | ||
483 | panic("EEH: MMIO failure (%d) on device:%s\n", reset_state, | ||
484 | pci_name(dev)); | ||
485 | } | ||
486 | else { | ||
487 | __get_cpu_var(ignored_failures)++; | ||
488 | printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n", | ||
489 | reset_state, pci_name(dev)); | ||
490 | } | ||
491 | } | ||
492 | |||
493 | /** | ||
494 | * eeh_event_handler - dispatch EEH events. The detection of a frozen | ||
495 | * slot can occur inside an interrupt, where it can be hard to do | ||
496 | * anything about it. The goal of this routine is to pull these | ||
497 | * detection events out of the context of the interrupt handler, and | ||
498 | * re-dispatch them for processing at a later time in a normal context. | ||
499 | * | ||
500 | * @dummy - unused | ||
501 | */ | ||
502 | static void eeh_event_handler(void *dummy) | ||
503 | { | ||
504 | unsigned long flags; | ||
505 | struct eeh_event *event; | ||
506 | |||
507 | while (1) { | ||
508 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | ||
509 | event = NULL; | ||
510 | if (!list_empty(&eeh_eventlist)) { | ||
511 | event = list_entry(eeh_eventlist.next, struct eeh_event, list); | ||
512 | list_del(&event->list); | ||
513 | } | ||
514 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | ||
515 | if (event == NULL) | ||
516 | break; | ||
517 | |||
518 | printk(KERN_INFO "EEH: MMIO failure (%d), notifiying device " | ||
519 | "%s\n", event->reset_state, | ||
520 | pci_name(event->dev)); | ||
521 | |||
522 | notifier_call_chain (&eeh_notifier_chain, | ||
523 | EEH_NOTIFY_FREEZE, event); | ||
524 | |||
525 | pci_dev_put(event->dev); | ||
526 | kfree(event); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /** | ||
531 | * eeh_token_to_phys - convert EEH address token to phys address | 437 | * eeh_token_to_phys - convert EEH address token to phys address |
532 | * @token i/o token, should be address in the form 0xA.... | 438 | * @token i/o token, should be address in the form 0xA.... |
533 | */ | 439 | */ |
@@ -613,8 +519,6 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
613 | int ret; | 519 | int ret; |
614 | int rets[3]; | 520 | int rets[3]; |
615 | unsigned long flags; | 521 | unsigned long flags; |
616 | int reset_state; | ||
617 | struct eeh_event *event; | ||
618 | struct pci_dn *pdn; | 522 | struct pci_dn *pdn; |
619 | struct device_node *pe_dn; | 523 | struct device_node *pe_dn; |
620 | int rc = 0; | 524 | int rc = 0; |
@@ -722,33 +626,12 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) | |||
722 | __eeh_mark_slot (pe_dn); | 626 | __eeh_mark_slot (pe_dn); |
723 | spin_unlock_irqrestore(&confirm_error_lock, flags); | 627 | spin_unlock_irqrestore(&confirm_error_lock, flags); |
724 | 628 | ||
725 | reset_state = rets[0]; | 629 | eeh_send_failure_event (dn, dev, rets[0], rets[2]); |
726 | 630 | ||
727 | eeh_slot_error_detail (pdn, 1 /* Temporary Error */); | ||
728 | |||
729 | printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n", | ||
730 | rets[0], dn->name, dn->full_name); | ||
731 | event = kmalloc(sizeof(*event), GFP_ATOMIC); | ||
732 | if (event == NULL) { | ||
733 | eeh_panic(dev, reset_state); | ||
734 | return 1; | ||
735 | } | ||
736 | |||
737 | event->dev = dev; | ||
738 | event->dn = dn; | ||
739 | event->reset_state = reset_state; | ||
740 | |||
741 | /* We may or may not be called in an interrupt context */ | ||
742 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | ||
743 | list_add(&event->list, &eeh_eventlist); | ||
744 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | ||
745 | |||
746 | /* Most EEH events are due to device driver bugs. Having | 631 | /* Most EEH events are due to device driver bugs. Having |
747 | * a stack trace will help the device-driver authors figure | 632 | * a stack trace will help the device-driver authors figure |
748 | * out what happened. So print that out. */ | 633 | * out what happened. So print that out. */ |
749 | if (rets[0] != 5) dump_stack(); | 634 | if (rets[0] != 5) dump_stack(); |
750 | schedule_work(&eeh_event_wq); | ||
751 | |||
752 | return 1; | 635 | return 1; |
753 | 636 | ||
754 | dn_unlock: | 637 | dn_unlock: |
@@ -793,6 +676,14 @@ unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned lon | |||
793 | 676 | ||
794 | EXPORT_SYMBOL(eeh_check_failure); | 677 | EXPORT_SYMBOL(eeh_check_failure); |
795 | 678 | ||
679 | /* ------------------------------------------------------------- */ | ||
680 | /* The code below deals with enabling EEH for devices during the | ||
681 | * early boot sequence. EEH must be enabled before any PCI probing | ||
682 | * can be done. | ||
683 | */ | ||
684 | |||
685 | #define EEH_ENABLE 1 | ||
686 | |||
796 | struct eeh_early_enable_info { | 687 | struct eeh_early_enable_info { |
797 | unsigned int buid_hi; | 688 | unsigned int buid_hi; |
798 | unsigned int buid_lo; | 689 | unsigned int buid_lo; |
@@ -850,8 +741,9 @@ static void *early_enable_eeh(struct device_node *dn, void *data) | |||
850 | /* First register entry is addr (00BBSS00) */ | 741 | /* First register entry is addr (00BBSS00) */ |
851 | /* Try to enable eeh */ | 742 | /* Try to enable eeh */ |
852 | ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, | 743 | ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, |
853 | regs[0], info->buid_hi, info->buid_lo, | 744 | regs[0], info->buid_hi, info->buid_lo, |
854 | EEH_ENABLE); | 745 | EEH_ENABLE); |
746 | |||
855 | if (ret == 0) { | 747 | if (ret == 0) { |
856 | eeh_subsystem_enabled = 1; | 748 | eeh_subsystem_enabled = 1; |
857 | pdn->eeh_mode |= EEH_MODE_SUPPORTED; | 749 | pdn->eeh_mode |= EEH_MODE_SUPPORTED; |
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c new file mode 100644 index 000000000000..92497333c2b6 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_event.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * eeh_event.c | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Copyright (c) 2005 Linas Vepstas <linas@linas.org> | ||
19 | */ | ||
20 | |||
21 | #include <linux/list.h> | ||
22 | #include <linux/pci.h> | ||
23 | #include <asm/eeh_event.h> | ||
24 | |||
25 | /** Overview: | ||
26 | * EEH error states may be detected within exception handlers; | ||
27 | * however, the recovery processing needs to occur asynchronously | ||
28 | * in a normal kernel context and not an interrupt context. | ||
29 | * This pair of routines creates an event and queues it onto a | ||
30 | * work-queue, where a worker thread can drive recovery. | ||
31 | */ | ||
32 | |||
33 | /* EEH event workqueue setup. */ | ||
34 | static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED; | ||
35 | LIST_HEAD(eeh_eventlist); | ||
36 | static void eeh_thread_launcher(void *); | ||
37 | DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); | ||
38 | |||
39 | /** | ||
40 | * eeh_panic - call panic() for an eeh event that cannot be handled. | ||
41 | * The philosophy of this routine is that it is better to panic and | ||
42 | * halt the OS than it is to risk possible data corruption by | ||
43 | * oblivious device drivers that don't know better. | ||
44 | * | ||
45 | * @dev pci device that had an eeh event | ||
46 | * @reset_state current reset state of the device slot | ||
47 | */ | ||
48 | static void eeh_panic(struct pci_dev *dev, int reset_state) | ||
49 | { | ||
50 | /* | ||
51 | * Since the panic_on_oops sysctl is used to halt the system | ||
52 | * in light of potential corruption, we can use it here. | ||
53 | */ | ||
54 | if (panic_on_oops) { | ||
55 | panic("EEH: MMIO failure (%d) on device:%s\n", reset_state, | ||
56 | pci_name(dev)); | ||
57 | } | ||
58 | else { | ||
59 | printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n", | ||
60 | reset_state, pci_name(dev)); | ||
61 | } | ||
62 | } | ||
63 | |||
64 | /** | ||
65 | * eeh_event_handler - dispatch EEH events. The detection of a frozen | ||
66 | * slot can occur inside an interrupt, where it can be hard to do | ||
67 | * anything about it. The goal of this routine is to pull these | ||
68 | * detection events out of the context of the interrupt handler, and | ||
69 | * re-dispatch them for processing at a later time in a normal context. | ||
70 | * | ||
71 | * @dummy - unused | ||
72 | */ | ||
73 | static int eeh_event_handler(void * dummy) | ||
74 | { | ||
75 | unsigned long flags; | ||
76 | struct eeh_event *event; | ||
77 | |||
78 | daemonize ("eehd"); | ||
79 | |||
80 | while (1) { | ||
81 | set_current_state(TASK_INTERRUPTIBLE); | ||
82 | |||
83 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | ||
84 | event = NULL; | ||
85 | if (!list_empty(&eeh_eventlist)) { | ||
86 | event = list_entry(eeh_eventlist.next, struct eeh_event, list); | ||
87 | list_del(&event->list); | ||
88 | } | ||
89 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | ||
90 | if (event == NULL) | ||
91 | break; | ||
92 | |||
93 | printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", | ||
94 | pci_name(event->dev)); | ||
95 | |||
96 | eeh_panic (event->dev, event->state); | ||
97 | |||
98 | kfree(event); | ||
99 | } | ||
100 | |||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | /** | ||
105 | * eeh_thread_launcher - work callback that starts the eeh_event_handler kernel thread | |
106 | * | ||
107 | * @dummy - unused | ||
108 | */ | ||
109 | static void eeh_thread_launcher(void *dummy) | ||
110 | { | ||
111 | if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0) | ||
112 | printk(KERN_ERR "Failed to start EEH daemon\n"); | ||
113 | } | ||
114 | |||
115 | /** | ||
116 | * eeh_send_failure_event - generate a PCI error event | ||
117 | * @dev pci device; may be NULL, otherwise pci_dev_get() is called on it | |
118 | * | ||
119 | * This routine can be called within an interrupt context; | ||
120 | * the actual event will be delivered in a normal context | ||
121 | * (from a kernel thread that is launched by a workqueue item). | |
122 | */ | ||
123 | int eeh_send_failure_event (struct device_node *dn, | ||
124 | struct pci_dev *dev, | ||
125 | int state, | ||
126 | int time_unavail) | ||
127 | { | ||
128 | unsigned long flags; | ||
129 | struct eeh_event *event; | ||
130 | |||
131 | event = kmalloc(sizeof(*event), GFP_ATOMIC); | ||
132 | if (event == NULL) { | ||
133 | printk (KERN_ERR "EEH: out of memory, event not handled\n"); | ||
134 | return 1; | ||
135 | } | ||
136 | |||
137 | if (dev) | ||
138 | pci_dev_get(dev); | ||
139 | |||
140 | event->dn = dn; | ||
141 | event->dev = dev; | ||
142 | event->state = state; | ||
143 | event->time_unavail = time_unavail; | ||
144 | |||
145 | /* We may or may not be called in an interrupt context */ | ||
146 | spin_lock_irqsave(&eeh_eventlist_lock, flags); | ||
147 | list_add(&event->list, &eeh_eventlist); | ||
148 | spin_unlock_irqrestore(&eeh_eventlist_lock, flags); | ||
149 | |||
150 | schedule_work(&eeh_event_wq); | ||
151 | |||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | /********************** END OF FILE ******************************/ | ||