author    David S. Miller <davem@sunset.davemloft.net>  2006-03-03 00:50:47 -0500
committer David S. Miller <davem@sunset.davemloft.net>  2006-03-20 04:14:17 -0500
commit    3cab0c3e8636d5005041aa52224f796c3a4ef872 (patch)
tree      582c92940f46cb0ecf8fafd4fde1cfd346172366 /arch/sparc64/kernel/smp.c
parent    bcc28ee0bf390df0d81cc9dafe980faef6b2771a (diff)
[SPARC64]: More SUN4V cpu mondo bug fixing.
This cpu mondo sending interface isn't all that easy to use correctly...

We were clearing out the wrong bits from the "mask" after getting something other than EOK from the hypervisor.

It turns out the hypervisor can just be resent the same cpu_list[] array, with the 0xffff "done" entries still in there, and it will do the right thing.

So don't update or try to rebuild the cpu_list[] array to condense it.

This requires the "forward_progress" check to be done slightly differently, but this new scheme is less bug prone than what we were doing before.

Signed-off-by: David S. Miller <davem@davemloft.net>
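For readers outside the sparc64 tree, below is a minimal, self-contained sketch of the retry pattern the commit message describes: the 0xffff "done" markers are left in the list, the same array is resent on every attempt, and only a lack of forward progress advances the timeout counter. The names and behaviour here (hv_send_mondo(), DONE_MARK, delivering two entries per call) are hypothetical stand-ins for illustration, not the real sun4v hypervisor API.

#include <stdio.h>

#define DONE_MARK      0xffffu  /* hypothetical "already delivered" marker */
#define HV_EOK         0
#define HV_EWOULDBLOCK 1

/* Hypothetical send: delivers to at most two pending entries per call,
 * marking them DONE_MARK, and reports EWOULDBLOCK while any remain. */
static int hv_send_mondo(unsigned int *cpu_list, int cnt)
{
	int delivered = 0;

	for (int i = 0; i < cnt && delivered < 2; i++) {
		if (cpu_list[i] != DONE_MARK) {
			cpu_list[i] = DONE_MARK;
			delivered++;
		}
	}
	for (int i = 0; i < cnt; i++) {
		if (cpu_list[i] != DONE_MARK)
			return HV_EWOULDBLOCK;
	}
	return HV_EOK;
}

int main(void)
{
	unsigned int cpu_list[] = { 0, 1, 2, 3, 4, 5 };
	int cnt = 6, retries = 0, prev_sent = 0;

	do {
		int status = hv_send_mondo(cpu_list, cnt);

		if (status == HV_EOK)
			break;

		/* Count the 0xffff "done" entries instead of rebuilding
		 * the list; the same array is handed back to the next
		 * send call unchanged. */
		int n_sent = 0;
		for (int i = 0; i < cnt; i++) {
			if (cpu_list[i] == DONE_MARK)
				n_sent++;
		}

		/* Only advance the timeout state when no forward progress
		 * was made since the previous attempt. */
		if (n_sent <= prev_sent && ++retries > 10000) {
			fprintf(stderr, "mondo delivery timed out\n");
			return 1;
		}
		prev_sent = n_sent;
	} while (1);

	printf("all %d entries delivered, %d no-progress retries\n",
	       cnt, retries);
	return 0;
}

With this toy send delivering two entries per call, every attempt makes progress, so the loop finishes after three calls without ever touching the retry counter; the counter only ticks on rounds where the count of 0xffff entries fails to grow.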
Diffstat (limited to 'arch/sparc64/kernel/smp.c')
-rw-r--r--   arch/sparc64/kernel/smp.c   40
1 file changed, 24 insertions(+), 16 deletions(-)
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 6bc7fd47e443..c4548a88953c 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -563,7 +563,7 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 	u64 *mondo;
 	cpumask_t error_mask;
 	unsigned long flags, status;
-	int cnt, retries, this_cpu, i;
+	int cnt, retries, this_cpu, prev_sent, i;
 
 	/* We have to do this whole thing with interrupts fully disabled.
 	 * Otherwise if we send an xcall from interrupt context it will
@@ -595,8 +595,9 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 
 	cpus_clear(error_mask);
 	retries = 0;
+	prev_sent = 0;
 	do {
-		int forward_progress;
+		int forward_progress, n_sent;
 
 		status = sun4v_cpu_mondo_send(cnt,
 					      tb->cpu_list_pa,
@@ -606,18 +607,23 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 		if (likely(status == HV_EOK))
 			break;
 
-		/* First, clear out all the cpus in the mask that were
-		 * successfully sent to.  The hypervisor indicates this
-		 * by setting the cpu list entry of such cpus to 0xffff.
+		/* First, see if we made any forward progress.
+		 *
+		 * The hypervisor indicates successful sends by setting
+		 * cpu list entries to the value 0xffff.
 		 */
-		forward_progress = 0;
+		n_sent = 0;
 		for (i = 0; i < cnt; i++) {
-			if (cpu_list[i] == 0xffff) {
-				cpu_clear(i, mask);
-				forward_progress = 1;
-			}
+			if (likely(cpu_list[i] == 0xffff))
+				n_sent++;
 		}
 
+		forward_progress = 0;
+		if (n_sent > prev_sent)
+			forward_progress = 1;
+
+		prev_sent = n_sent;
+
 		/* If we get a HV_ECPUERROR, then one or more of the cpus
 		 * in the list are in error state.  Use the cpu_state()
 		 * hypervisor call to find out which cpus are in error state.
@@ -634,18 +640,20 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 				err = sun4v_cpu_state(cpu);
 				if (err >= 0 &&
 				    err == HV_CPU_STATE_ERROR) {
-					cpu_clear(cpu, mask);
+					cpu_list[i] = 0xffff;
 					cpu_set(cpu, error_mask);
 				}
 			}
 		} else if (unlikely(status != HV_EWOULDBLOCK))
 			goto fatal_mondo_error;
 
-		/* Rebuild the cpu_list[] array and try again. */
-		cnt = 0;
-		for_each_cpu_mask(i, mask)
-			cpu_list[cnt++] = i;
-
+		/* Don't bother rewriting the CPU list, just leave the
+		 * 0xffff and non-0xffff entries in there and the
+		 * hypervisor will do the right thing.
+		 *
+		 * Only advance timeout state if we didn't make any
+		 * forward progress.
+		 */
 		if (unlikely(!forward_progress)) {
 			if (unlikely(++retries > 10000))
 				goto fatal_mondo_timeout;