4 files changed, 109 insertions, 46 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 5d1b743dbe7e..5b3da4b4ea79 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -10,7 +10,7 @@ config PPC_PSERIES
        select RTAS_ERROR_LOGGING
        select PPC_UDBG_16550
        select PPC_NATIVE
-        select PPC_PCI_CHOICE if EMBEDDED
+        select PPC_PCI_CHOICE if EXPERT
        default y
 config PPC_SPLPAR
@@ -24,9 +24,9 @@ config PPC_SPLPAR
          two or more partitions.
 config EEH
-        bool "PCI Extended Error Handling (EEH)" if EMBEDDED
+        bool "PCI Extended Error Handling (EEH)" if EXPERT
        depends on PPC_PSERIES && PCI
-        default y if !EMBEDDED
+        default y if !EXPERT
 config PSERIES_MSI
       bool
diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c
index 53cbd53d8740..77d38a5e2ff9 100644
--- a/arch/powerpc/platforms/pseries/kexec.c
+++ b/arch/powerpc/platforms/pseries/kexec.c
@@ -61,13 +61,3 @@ void __init setup_kexec_cpu_down_xics(void)
 {
        ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;
 }
-static int __init pseries_kexec_setup(void)
-{
-        ppc_md.machine_kexec = default_machine_kexec;
-        ppc_md.machine_kexec_prepare = default_machine_kexec_prepare;
-        ppc_md.machine_crash_shutdown = default_machine_crash_shutdown;
-        return 0;
-}
-machine_device_initcall(pseries, pseries_kexec_setup);
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 5d3ea9f60dd7..ca5d5898d320 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -713,6 +713,13 @@ EXPORT_SYMBOL(arch_free_page);
 /* NB: reg/unreg are called while guarded with the tracepoints_mutex */
 extern long hcall_tracepoint_refcount;
+/* 
+ * Since the tracing code might execute hcalls we need to guard against
+ * recursion. One example of this is spinlocks calling H_YIELD on
+ * shared processor partitions.
+ */
+static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
 void hcall_tracepoint_regfunc(void)
 {
        hcall_tracepoint_refcount++;
@@ -725,12 +732,42 @@ void hcall_tracepoint_unregfunc(void)
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
 {
+        unsigned long flags;
+        unsigned int *depth;
+        local_irq_save(flags);
+        depth = &__get_cpu_var(hcall_trace_depth);
+        if (*depth)
+                goto out;
+        (*depth)++;
        trace_hcall_entry(opcode, args);
+        (*depth)--;
+out:
+        local_irq_restore(flags);
 }
 void __trace_hcall_exit(long opcode, unsigned long retval,
                        unsigned long *retbuf)
 {
+        unsigned long flags;
+        unsigned int *depth;
+        local_irq_save(flags);
+        depth = &__get_cpu_var(hcall_trace_depth);
+        if (*depth)
+                goto out;
+        (*depth)++;
        trace_hcall_exit(opcode, retval, retbuf);
+        (*depth)--;
+out:
+        local_irq_restore(flags);
 }
 #endif
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index a4fc6da87c2e..c55d7ad9c648 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -54,7 +54,8 @@
 static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
 static DEFINE_SPINLOCK(ras_log_buf_lock);
-static char mce_data_buf[RTAS_ERROR_LOG_MAX];
+static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_PER_CPU(__u64, mce_data_buf);
 static int ras_get_sensor_state_token;
 static int ras_check_exception_token;
@@ -196,12 +197,24 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
-/* Get the error information for errors coming through the
+/*
+ * Some versions of FWNMI place the buffer inside the 4kB page starting at
+ * 0x7000. Other versions place it inside the rtas buffer. We check both.
+ */
+#define VALID_FWNMI_BUFFER(A) \
+        ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
+        (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
+/*
+ * Get the error information for errors coming through the
 * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
 * the actual r3 if possible, and a ptr to the error log entry
 * will be returned if found.
 *
- * The mce_data_buf does not have any locks or protection around it,
+ * If the RTAS error is not of the extended type, then we put it in a per
+ * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
+ *
+ * The global_mce_data_buf does not have any locks or protection around it,
 * if a second machine check comes in, or a system reset is done
 * before we have logged the error, then we will get corruption in the
 * error log.  This is preferable over holding off on calling
@@ -210,20 +223,31 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
 */
 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
 {
-        unsigned long errdata = regs->gpr[3];
-        struct rtas_error_log *errhdr = NULL;
        unsigned long *savep;
+        struct rtas_error_log *h, *errhdr = NULL;
+        if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
+                printk(KERN_ERR "FWNMI: corrupt r3\n");
+                return NULL;
+        }
-        if ((errdata >= 0x7000 && errdata < 0x7fff0) ||
+        savep = __va(regs->gpr[3]);
-            (errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) {
+        regs->gpr[3] = savep[0];        /* restore original r3 */
-                savep = __va(errdata);
-                regs->gpr[3] = savep[0];        /* restore original r3 */
+        /* If it isn't an extended log we can use the per cpu 64bit buffer */
-                memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+        h = (struct rtas_error_log *)&savep[1];
-                memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX);
+        if (!h->extended) {
-                errhdr = (struct rtas_error_log *)mce_data_buf;
+                memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
+                errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
        } else {
-                printk("FWNMI: corrupt r3\n");
+                int len;
+                /* Never copy more than global_mce_data_buf can hold */
+                len = min_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX);
+                memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+                memcpy(global_mce_data_buf, h, len);
+                errhdr = (struct rtas_error_log *)global_mce_data_buf;
        }
        return errhdr;
 }
@@ -235,7 +259,7 @@ static void fwnmi_release_errinfo(void)
 {
        int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
        if (ret != 0)
-                printk("FWNMI: nmi-interlock failed: %d\n", ret);
+                printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
 }
 int pSeries_system_reset_exception(struct pt_regs *regs)
@@ -259,31 +283,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 * Return 1 if corrected (or delivered a signal).
 * Return 0 if there is nothing we can do.
 */
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
 {
-        int nonfatal = 0;
+        int recovered = 0;
-        if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+        if (!(regs->msr & MSR_RI)) {
+                /* If MSR_RI isn't set, we cannot recover */
+                recovered = 0;
+        } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
                /* Platform corrected itself */
-                nonfatal = 1;
+                recovered = 1;
-        } else if ((regs->msr & MSR_RI) &&
-                   user_mode(regs) &&
+        } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
-                   err->severity == RTAS_SEVERITY_ERROR_SYNC &&
+                /* Platform corrected itself but could be degraded */
-                   err->disposition == RTAS_DISP_NOT_RECOVERED &&
+                printk(KERN_ERR "MCE: limited recovery, system may "
-                   err->target == RTAS_TARGET_MEMORY &&
+                       "be degraded\n");
-                   err->type == RTAS_TYPE_ECC_UNCORR &&
+                recovered = 1;
-                   !(current->pid == 0 || is_global_init(current))) {
-                /* Kill off a user process with an ECC error */
+        } else if (user_mode(regs) && !is_global_init(current) &&
-                printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
+                   err->severity == RTAS_SEVERITY_ERROR_SYNC) {
-                       current->pid);
-                /* XXX something better for ECC error? */
+                /*
-                _exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
+                 * If we received a synchronous error when in userspace
-                nonfatal = 1;
+                 * kill the task. Firmware may report details of the fail
+                 * asynchronously, so we can't rely on the target and type
+                 * fields being valid here.
+                 */
+                printk(KERN_ERR "MCE: uncorrectable error, killing task "
+                       "%s:%d\n", current->comm, current->pid);
+                _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+                recovered = 1;
        }
-        log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal);
+        log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
-        return nonfatal;
+        return recovered;
 }
 /*

diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 5d1b743dbe7e..5b3da4b4ea79 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -10,7 +10,7 @@ config PPC_PSERIES
10	select RTAS_ERROR_LOGGING	10	select RTAS_ERROR_LOGGING
11	select PPC_UDBG_16550	11	select PPC_UDBG_16550
12	select PPC_NATIVE	12	select PPC_NATIVE
13	select PPC_PCI_CHOICE if EMBEDDED	13	select PPC_PCI_CHOICE if EXPERT
14	default y	14	default y
15		15
16	config PPC_SPLPAR	16	config PPC_SPLPAR
@@ -24,9 +24,9 @@ config PPC_SPLPAR
24	two or more partitions.	24	two or more partitions.
25		25
26	config EEH	26	config EEH
27	bool "PCI Extended Error Handling (EEH)" if EMBEDDED	27	bool "PCI Extended Error Handling (EEH)" if EXPERT
28	depends on PPC_PSERIES && PCI	28	depends on PPC_PSERIES && PCI
29	default y if !EMBEDDED	29	default y if !EXPERT
30		30
31	config PSERIES_MSI	31	config PSERIES_MSI
32	bool	32	bool


diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index 53cbd53d8740..77d38a5e2ff9 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c
@@ -61,13 +61,3 @@ void __init setup_kexec_cpu_down_xics(void)
61	{	61	{
62	ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;	62	ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;
63	}	63	}
64
65	static int __init pseries_kexec_setup(void)
66	{
67	ppc_md.machine_kexec = default_machine_kexec;
68	ppc_md.machine_kexec_prepare = default_machine_kexec_prepare;
69	ppc_md.machine_crash_shutdown = default_machine_crash_shutdown;
70
71	return 0;
72	}
73	machine_device_initcall(pseries, pseries_kexec_setup);


diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 5d3ea9f60dd7..ca5d5898d320 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -713,6 +713,13 @@ EXPORT_SYMBOL(arch_free_page);
713	/* NB: reg/unreg are called while guarded with the tracepoints_mutex */	713	/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
714	extern long hcall_tracepoint_refcount;	714	extern long hcall_tracepoint_refcount;
715		715
		716	/*
		717	* Since the tracing code might execute hcalls we need to guard against
		718	* recursion. One example of this is spinlocks calling H_YIELD on
		719	* shared processor partitions.
		720	*/
		721	static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
		722
716	void hcall_tracepoint_regfunc(void)	723	void hcall_tracepoint_regfunc(void)
717	{	724	{
718	hcall_tracepoint_refcount++;	725	hcall_tracepoint_refcount++;
@@ -725,12 +732,42 @@ void hcall_tracepoint_unregfunc(void)
725		732
726	void __trace_hcall_entry(unsigned long opcode, unsigned long *args)	733	void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
727	{	734	{
		735	unsigned long flags;
		736	unsigned int *depth;
		737
		738	local_irq_save(flags);
		739
		740	depth = &__get_cpu_var(hcall_trace_depth);
		741
		742	if (*depth)
		743	goto out;
		744
		745	(*depth)++;
728	trace_hcall_entry(opcode, args);	746	trace_hcall_entry(opcode, args);
		747	(*depth)--;
		748
		749	out:
		750	local_irq_restore(flags);
729	}	751	}
730		752
731	void __trace_hcall_exit(long opcode, unsigned long retval,	753	void __trace_hcall_exit(long opcode, unsigned long retval,
732	unsigned long *retbuf)	754	unsigned long *retbuf)
733	{	755	{
		756	unsigned long flags;
		757	unsigned int *depth;
		758
		759	local_irq_save(flags);
		760
		761	depth = &__get_cpu_var(hcall_trace_depth);
		762
		763	if (*depth)
		764	goto out;
		765
		766	(*depth)++;
734	trace_hcall_exit(opcode, retval, retbuf);	767	trace_hcall_exit(opcode, retval, retbuf);
		768	(*depth)--;
		769
		770	out:
		771	local_irq_restore(flags);
735	}	772	}
736	#endif	773	#endif


diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index a4fc6da87c2e..c55d7ad9c648 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c
@@ -54,7 +54,8 @@
54	static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];	54	static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
55	static DEFINE_SPINLOCK(ras_log_buf_lock);	55	static DEFINE_SPINLOCK(ras_log_buf_lock);
56		56
57	static char mce_data_buf[RTAS_ERROR_LOG_MAX];	57	static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
		58	static DEFINE_PER_CPU(__u64, mce_data_buf);
58		59
59	static int ras_get_sensor_state_token;	60	static int ras_get_sensor_state_token;
60	static int ras_check_exception_token;	61	static int ras_check_exception_token;
@@ -196,12 +197,24 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
196	return IRQ_HANDLED;	197	return IRQ_HANDLED;
197	}	198	}
198		199
199	/* Get the error information for errors coming through the	200	/*
		201	* Some versions of FWNMI place the buffer inside the 4kB page starting at
		202	* 0x7000. Other versions place it inside the rtas buffer. We check both.
		203	*/
		204	#define VALID_FWNMI_BUFFER(A) \
		205	((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
		206	(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
		207
		208	/*
		209	* Get the error information for errors coming through the
200	* FWNMI vectors. The pt_regs' r3 will be updated to reflect	210	* FWNMI vectors. The pt_regs' r3 will be updated to reflect
201	* the actual r3 if possible, and a ptr to the error log entry	211	* the actual r3 if possible, and a ptr to the error log entry
202	* will be returned if found.	212	* will be returned if found.
203	*	213	*
204	* The mce_data_buf does not have any locks or protection around it,	214	* If the RTAS error is not of the extended type, then we put it in a per
		215	* cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
		216	*
		217	* The global_mce_data_buf does not have any locks or protection around it,
205	* if a second machine check comes in, or a system reset is done	218	* if a second machine check comes in, or a system reset is done
206	* before we have logged the error, then we will get corruption in the	219	* before we have logged the error, then we will get corruption in the
207	* error log. This is preferable over holding off on calling	220	* error log. This is preferable over holding off on calling
@@ -210,20 +223,31 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
210	*/	223	*/
211	static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)	224	static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
212	{	225	{
213	unsigned long errdata = regs->gpr[3];
214	struct rtas_error_log *errhdr = NULL;
215	unsigned long *savep;	226	unsigned long *savep;
		227	struct rtas_error_log *h, *errhdr = NULL;
		228
		229	if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
		230	printk(KERN_ERR "FWNMI: corrupt r3\n");
		231	return NULL;
		232	}
216		233
217	if ((errdata >= 0x7000 && errdata < 0x7fff0) ||	234	savep = __va(regs->gpr[3]);
218	(errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) {	235	regs->gpr[3] = savep[0]; /* restore original r3 */
219	savep = __va(errdata);	236
220	regs->gpr[3] = savep[0]; /* restore original r3 */	237	/* If it isn't an extended log we can use the per cpu 64bit buffer */
221	memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX);	238	h = (struct rtas_error_log *)&savep[1];
222	memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX);	239	if (!h->extended) {
223	errhdr = (struct rtas_error_log *)mce_data_buf;	240	memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
		241	errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
224	} else {	242	} else {
225	printk("FWNMI: corrupt r3\n");	243	int len;
		244
		245	len = min_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX);
		246	memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
		247	memcpy(global_mce_data_buf, h, len);
		248	errhdr = (struct rtas_error_log *)global_mce_data_buf;
226	}	249	}
		250
227	return errhdr;	251	return errhdr;
228	}	252	}
229		253
@@ -235,7 +259,7 @@ static void fwnmi_release_errinfo(void)
235	{	259	{
236	int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);	260	int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
237	if (ret != 0)	261	if (ret != 0)
238	printk("FWNMI: nmi-interlock failed: %d\n", ret);	262	printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
239	}	263	}
240		264
241	int pSeries_system_reset_exception(struct pt_regs *regs)	265	int pSeries_system_reset_exception(struct pt_regs *regs)
@@ -259,31 +283,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
259	* Return 1 if corrected (or delivered a signal).	283	* Return 1 if corrected (or delivered a signal).
260	* Return 0 if there is nothing we can do.	284	* Return 0 if there is nothing we can do.
261	*/	285	*/
262	static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)	286	static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
263	{	287	{
264	int nonfatal = 0;	288	int recovered = 0;
265		289
266	if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {	290	if (!(regs->msr & MSR_RI)) {
		291	/* If MSR_RI isn't set, we cannot recover */
		292	recovered = 0;
		293
		294	} else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
267	/* Platform corrected itself */	295	/* Platform corrected itself */
268	nonfatal = 1;	296	recovered = 1;
269	} else if ((regs->msr & MSR_RI) &&	297
270	user_mode(regs) &&	298	} else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
271	err->severity == RTAS_SEVERITY_ERROR_SYNC &&	299	/* Platform corrected itself but could be degraded */
272	err->disposition == RTAS_DISP_NOT_RECOVERED &&	300	printk(KERN_ERR "MCE: limited recovery, system may "
273	err->target == RTAS_TARGET_MEMORY &&	301	"be degraded\n");
274	err->type == RTAS_TYPE_ECC_UNCORR &&	302	recovered = 1;
275	!(current->pid == 0 || is_global_init(current))) {	303
276	/* Kill off a user process with an ECC error */	304	} else if (user_mode(regs) && !is_global_init(current) &&
277	printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",	305	err->severity == RTAS_SEVERITY_ERROR_SYNC) {
278	current->pid);	306
279	/* XXX something better for ECC error? */	307	/*
280	_exception(SIGBUS, regs, BUS_ADRERR, regs->nip);	308	* If we received a synchronous error when in userspace
281	nonfatal = 1;	309	* kill the task. Firmware may report details of the fail
		310	* asynchronously, so we can't rely on the target and type
		311	* fields being valid here.
		312	*/
		313	printk(KERN_ERR "MCE: uncorrectable error, killing task "
		314	"%s:%d\n", current->comm, current->pid);
		315
		316	_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
		317	recovered = 1;
282	}	318	}
283		319
284	log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal);	320	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
285		321
286	return nonfatal;	322	return recovered;
287	}	323	}
288		324
289	/*	325	/*