[SPARC64]: Fix bugs in SUN4V cpu mondo dispatch.

There were several bugs in the SUN4V cpu mondo dispatch code. In fact, if we ever got an EWOULDBLOCK or other error from the hypervisor call, we'd potentially send a cpu mondo multiple times to the same cpu and even worse we could loop until the timeout resending the same mondo over and over to such cpus. So let's bulletproof this thing as follows: 1) Implement cpu_mondo_send() and cpu_state() hypervisor calls in arch/sparc64/kernel/entry.S, add prototypes to asm/hypervisor.h 2) Don't build and update the cpulist using inline functions, this was causing the cpu mask to not get updated in the caller. 3) Disable interrupts during the entire mondo send, otherwise our cpu list and/or mondo block could get overwritten if we take an interrupt and do a cpu mondo send on the current cpu. 4) Check for all possible error return types from the cpu_mondo_send() hypervisor call. In particular: HV_EOK) Our work is done, all cpus have received the mondo. HV_ECPUERROR) One or more of the cpus in the cpu list we passed to the hypervisor are in error state. Use cpu_state() calls over the entries in the cpu list to see which ones. Record them in "error_mask" and report this after we are done sending the mondo to cpus which are not in error state. HV_EWOULDBLOCK) We need to keep trying. Any other error we consider fatal, we report the event and exit immediately. 5) We only time out if forward progress is not made. Forward progress is defined as having at least one cpu get the mondo successfully in a given cpu_mondo_send() call. Otherwise we bump a counter and delay a little. If the counter hits a limit, we signal an error and report the event. Also, smp_call_function_mask() error handling reports the number of cpus incorrectly. Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2006-02-28 18:10:26 -0500
committer: David S. Miller <davem@sunset.davemloft.net> 2006-03-20 04:14:09 -0500
commit: b830ab665ad96c6b20d51a89b35cbc09ab5a2c29 (patch)
tree: 57c2c75b3e069f9f244259ae02f6f2fe3de68612 /arch/sparc64
parent: aac0aadf09b98ba36eab0bb02a560ebcb82ac39f (diff)
2 files changed, 151 insertions, 57 deletions
diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
index 9f3048e64e84..6d0b3ed77a02 100644
--- a/arch/sparc64/kernel/entry.S
+++ b/arch/sparc64/kernel/entry.S
@@ -1795,3 +1795,31 @@ sun4v_cpu_yield:
        ta      HV_FAST_TRAP
        retl
         nop
+        /* %o0: num cpus in cpu list
+         * %o1: cpu list paddr
+         * %o2: mondo block paddr
+         *
+         * returns %o0: status
+         */
+        .globl  sun4v_cpu_mondo_send
+sun4v_cpu_mondo_send:
+        mov     HV_FAST_CPU_MONDO_SEND, %o5
+        ta      HV_FAST_TRAP
+        retl
+         nop
+        /* %o0: CPU ID
+         *
+         * returns %o0: -status if status non-zero, else
+         *         %o0: cpu state as HV_CPU_STATE_*
+         */
+        .globl  sun4v_cpu_state
+sun4v_cpu_state:
+        mov     HV_FAST_CPU_STATE, %o5
+        ta      HV_FAST_TRAP
+        brnz,pn %o0, 1f
+         sub    %g0, %o0, %o0
+        mov     %o1, %o0
+1:      retl
+         nop
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index eb7c0f855ba7..6bc7fd47e443 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -556,77 +556,144 @@ retry:
 }
 /* Multi-cpu list version.  */
-static int init_cpu_list(u16 *list, cpumask_t mask)
-{
-        int i, cnt;
-        cnt = 0;
-        for_each_cpu_mask(i, mask)
-                list[cnt++] = i;
-        return cnt;
-}
-static int update_cpu_list(u16 *list, int orig_cnt, cpumask_t mask)
-{
-        int i;
-        for (i = 0; i < orig_cnt; i++) {
-                if (list[i] == 0xffff)
-                        cpu_clear(i, mask);
-        }
-        return init_cpu_list(list, mask);
-}
 static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
 {
-        int this_cpu = get_cpu();
+        struct trap_per_cpu *tb;
-        struct trap_per_cpu *tb = &trap_block[this_cpu];
+        u16 *cpu_list;
-        u64 *mondo = __va(tb->cpu_mondo_block_pa);
+        u64 *mondo;
-        u16 *cpu_list = __va(tb->cpu_list_pa);
+        cpumask_t error_mask;
-        int cnt, retries;
+        unsigned long flags, status;
+        int cnt, retries, this_cpu, i;
+        /* We have to do this whole thing with interrupts fully disabled.
+         * Otherwise if we send an xcall from interrupt context it will
+         * corrupt both our mondo block and cpu list state.
+         *
+         * One consequence of this is that we cannot use timeout mechanisms
+         * that depend upon interrupts being delivered locally.  So, for
+         * example, we cannot sample jiffies and expect it to advance.
+         *
+         * Fortunately, udelay() uses %stick/%tick so we can use that.
+         */
+        local_irq_save(flags);
+        this_cpu = smp_processor_id();
+        tb = &trap_block[this_cpu];
+        mondo = __va(tb->cpu_mondo_block_pa);
        mondo[0] = data0;
        mondo[1] = data1;
        mondo[2] = data2;
        wmb();
+        cpu_list = __va(tb->cpu_list_pa);
+        /* Setup the initial cpu list.  */
+        cnt = 0;
+        for_each_cpu_mask(i, mask)
+                cpu_list[cnt++] = i;
+        cpus_clear(error_mask);
        retries = 0;
-        cnt = init_cpu_list(cpu_list, mask);
        do {
-                register unsigned long func __asm__("%o5");
+                int forward_progress;
-                register unsigned long arg0 __asm__("%o0");
-                register unsigned long arg1 __asm__("%o1");
+                status = sun4v_cpu_mondo_send(cnt,
-                register unsigned long arg2 __asm__("%o2");
+                                              tb->cpu_list_pa,
+                                              tb->cpu_mondo_block_pa);
-                func = HV_FAST_CPU_MONDO_SEND;
-                arg0 = cnt;
-                arg1 = tb->cpu_list_pa;
-                arg2 = tb->cpu_mondo_block_pa;
-                __asm__ __volatile__("ta        %8"
-                                     : "=&r" (func), "=&r" (arg0),
-                                       "=&r" (arg1), "=&r" (arg2)
-                                     : "0" (func), "1" (arg0),
-                                       "2" (arg1), "3" (arg2),
-                                       "i" (HV_FAST_TRAP)
-                                     : "memory");
-                if (likely(arg0 == HV_EOK))
-                        break;
-                if (unlikely(++retries > 100)) {
+                /* HV_EOK means all cpus received the xcall, we're done.  */
-                        printk("CPU[%d]: sun4v mondo error %lu\n",
+                if (likely(status == HV_EOK))
-                               this_cpu, arg0);
                        break;
+                /* First, clear out all the cpus in the mask that were
+                 * successfully sent to.  The hypervisor indicates this
+                 * by setting the cpu list entry of such cpus to 0xffff.
+                 */
+                forward_progress = 0;
+                for (i = 0; i < cnt; i++) {
+                        if (cpu_list[i] == 0xffff) {
+                                cpu_clear(i, mask);
+                                forward_progress = 1;
+                        }
                }
-                cnt = update_cpu_list(cpu_list, cnt, mask);
+                /* If we get a HV_ECPUERROR, then one or more of the cpus
+                 * in the list are in error state.  Use the cpu_state()
+                 * hypervisor call to find out which cpus are in error state.
+                 */
+                if (unlikely(status == HV_ECPUERROR)) {
+                        for (i = 0; i < cnt; i++) {
+                                long err;
+                                u16 cpu;
+                                cpu = cpu_list[i];
+                                if (cpu == 0xffff)
+                                        continue;
+                                err = sun4v_cpu_state(cpu);
+                                if (err >= 0 &&
+                                    err == HV_CPU_STATE_ERROR) {
+                                        cpu_clear(cpu, mask);
+                                        cpu_set(cpu, error_mask);
+                                }
+                        }
+                } else if (unlikely(status != HV_EWOULDBLOCK))
+                        goto fatal_mondo_error;
+                /* Rebuild the cpu_list[] array and try again.  */
+                cnt = 0;
+                for_each_cpu_mask(i, mask)
+                        cpu_list[cnt++] = i;
-                udelay(2 * cnt);
+                if (unlikely(!forward_progress)) {
+                        if (unlikely(++retries > 10000))
+                                goto fatal_mondo_timeout;
+                        /* Delay a little bit to let other cpus catch up
+                         * on their cpu mondo queue work.
+                         */
+                        udelay(2 * cnt);
+                }
        } while (1);
-        put_cpu();
+        local_irq_restore(flags);
+        if (unlikely(!cpus_empty(error_mask)))
+                goto fatal_mondo_cpu_error;
+        return;
+fatal_mondo_cpu_error:
+        printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
+               "were in error state\n",
+               this_cpu);
+        printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
+        for_each_cpu_mask(i, error_mask)
+                printk("%d ", i);
+        printk("]\n");
+        return;
+fatal_mondo_timeout:
+        local_irq_restore(flags);
+        printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
+               " progress after %d retries.\n",
+               this_cpu, retries);
+        goto dump_cpu_list_and_out;
+fatal_mondo_error:
+        local_irq_restore(flags);
+        printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
+               this_cpu, status);
+        printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
+               "mondo_block_pa(%lx)\n",
+               this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
+dump_cpu_list_and_out:
+        printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
+        for (i = 0; i < cnt; i++)
+                printk("%u ", cpu_list[i]);
+        printk("]\n");
 }
 /* Send cross call to all processors mentioned in MASK
@@ -723,9 +790,8 @@ static int smp_call_function_mask(void (*func)(void *info), void *info,
 out_timeout:
        spin_unlock(&call_lock);
-        printk("XCALL: Remote cpus not responding, ncpus=%ld finished=%ld\n",
+        printk("XCALL: Remote cpus not responding, ncpus=%d finished=%d\n",
-               (long) num_online_cpus() - 1L,
+               cpus, atomic_read(&data.finished));
-               (long) atomic_read(&data.finished));
        return 0;
 }
author	David S. Miller <davem@davemloft.net>	2006-02-28 18:10:26 -0500
committer	David S. Miller <davem@sunset.davemloft.net>	2006-03-20 04:14:09 -0500
commit	b830ab665ad96c6b20d51a89b35cbc09ab5a2c29 (patch)
tree	57c2c75b3e069f9f244259ae02f6f2fe3de68612 /arch/sparc64
parent	aac0aadf09b98ba36eab0bb02a560ebcb82ac39f (diff)

diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index 9f3048e64e84..6d0b3ed77a02 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S
@@ -1795,3 +1795,31 @@ sun4v_cpu_yield:
1795	ta HV_FAST_TRAP	1795	ta HV_FAST_TRAP
1796	retl	1796	retl
1797	nop	1797	nop
		1798
		1799	/* %o0: num cpus in cpu list
		1800	* %o1: cpu list paddr
		1801	* %o2: mondo block paddr
		1802	*
		1803	* returns %o0: status
		1804	*/
		1805	.globl sun4v_cpu_mondo_send
		1806	sun4v_cpu_mondo_send:
		1807	mov HV_FAST_CPU_MONDO_SEND, %o5
		1808	ta HV_FAST_TRAP
		1809	retl
		1810	nop
		1811
		1812	/* %o0: CPU ID
		1813	*
		1814	* returns %o0: -status if status non-zero, else
		1815	* %o0: cpu state as HV_CPU_STATE_*
		1816	*/
		1817	.globl sun4v_cpu_state
		1818	sun4v_cpu_state:
		1819	mov HV_FAST_CPU_STATE, %o5
		1820	ta HV_FAST_TRAP
		1821	brnz,pn %o0, 1f
		1822	sub %g0, %o0, %o0
		1823	mov %o1, %o0
		1824	1: retl
		1825	nop


diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index eb7c0f855ba7..6bc7fd47e443 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c
@@ -556,77 +556,144 @@ retry:
556	}	556	}
557		557
558	/* Multi-cpu list version. */	558	/* Multi-cpu list version. */
559	static int init_cpu_list(u16 *list, cpumask_t mask)
560	{
561	int i, cnt;
562
563	cnt = 0;
564	for_each_cpu_mask(i, mask)
565	list[cnt++] = i;
566
567	return cnt;
568	}
569
570	static int update_cpu_list(u16 *list, int orig_cnt, cpumask_t mask)
571	{
572	int i;
573
574	for (i = 0; i < orig_cnt; i++) {
575	if (list[i] == 0xffff)
576	cpu_clear(i, mask);
577	}
578
579	return init_cpu_list(list, mask);
580	}
581
582	static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)	559	static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
583	{	560	{
584	int this_cpu = get_cpu();	561	struct trap_per_cpu *tb;
585	struct trap_per_cpu *tb = &trap_block[this_cpu];	562	u16 *cpu_list;
586	u64 *mondo = __va(tb->cpu_mondo_block_pa);	563	u64 *mondo;
587	u16 *cpu_list = __va(tb->cpu_list_pa);	564	cpumask_t error_mask;
588	int cnt, retries;	565	unsigned long flags, status;
		566	int cnt, retries, this_cpu, i;
		567
		568	/* We have to do this whole thing with interrupts fully disabled.
		569	* Otherwise if we send an xcall from interrupt context it will
		570	* corrupt both our mondo block and cpu list state.
		571	*
		572	* One consequence of this is that we cannot use timeout mechanisms
		573	* that depend upon interrupts being delivered locally. So, for
		574	* example, we cannot sample jiffies and expect it to advance.
		575	*
		576	* Fortunately, udelay() uses %stick/%tick so we can use that.
		577	*/
		578	local_irq_save(flags);
		579
		580	this_cpu = smp_processor_id();
		581	tb = &trap_block[this_cpu];
589		582
		583	mondo = __va(tb->cpu_mondo_block_pa);
590	mondo[0] = data0;	584	mondo[0] = data0;
591	mondo[1] = data1;	585	mondo[1] = data1;
592	mondo[2] = data2;	586	mondo[2] = data2;
593	wmb();	587	wmb();
594		588
		589	cpu_list = __va(tb->cpu_list_pa);
		590
		591	/* Setup the initial cpu list. */
		592	cnt = 0;
		593	for_each_cpu_mask(i, mask)
		594	cpu_list[cnt++] = i;
		595
		596	cpus_clear(error_mask);
595	retries = 0;	597	retries = 0;
596	cnt = init_cpu_list(cpu_list, mask);
597	do {	598	do {
598	register unsigned long func __asm__("%o5");	599	int forward_progress;
599	register unsigned long arg0 __asm__("%o0");	600
600	register unsigned long arg1 __asm__("%o1");	601	status = sun4v_cpu_mondo_send(cnt,
601	register unsigned long arg2 __asm__("%o2");	602	tb->cpu_list_pa,
602		603	tb->cpu_mondo_block_pa);
603	func = HV_FAST_CPU_MONDO_SEND;
604	arg0 = cnt;
605	arg1 = tb->cpu_list_pa;
606	arg2 = tb->cpu_mondo_block_pa;
607
608	__asm__ __volatile__("ta %8"
609	: "=&r" (func), "=&r" (arg0),
610	"=&r" (arg1), "=&r" (arg2)
611	: "0" (func), "1" (arg0),
612	"2" (arg1), "3" (arg2),
613	"i" (HV_FAST_TRAP)
614	: "memory");
615	if (likely(arg0 == HV_EOK))
616	break;
617		604
618	if (unlikely(++retries > 100)) {	605	/* HV_EOK means all cpus received the xcall, we're done. */
619	printk("CPU[%d]: sun4v mondo error %lu\n",	606	if (likely(status == HV_EOK))
620	this_cpu, arg0);
621	break;	607	break;
		608
		609	/* First, clear out all the cpus in the mask that were
		610	* successfully sent to. The hypervisor indicates this
		611	* by setting the cpu list entry of such cpus to 0xffff.
		612	*/
		613	forward_progress = 0;
		614	for (i = 0; i < cnt; i++) {
		615	if (cpu_list[i] == 0xffff) {
		616	cpu_clear(i, mask);
		617	forward_progress = 1;
		618	}
622	}	619	}
623		620
624	cnt = update_cpu_list(cpu_list, cnt, mask);	621	/* If we get a HV_ECPUERROR, then one or more of the cpus
		622	* in the list are in error state. Use the cpu_state()
		623	* hypervisor call to find out which cpus are in error state.
		624	*/
		625	if (unlikely(status == HV_ECPUERROR)) {
		626	for (i = 0; i < cnt; i++) {
		627	long err;
		628	u16 cpu;
		629
		630	cpu = cpu_list[i];
		631	if (cpu == 0xffff)
		632	continue;
		633
		634	err = sun4v_cpu_state(cpu);
		635	if (err >= 0 &&
		636	err == HV_CPU_STATE_ERROR) {
		637	cpu_clear(cpu, mask);
		638	cpu_set(cpu, error_mask);
		639	}
		640	}
		641	} else if (unlikely(status != HV_EWOULDBLOCK))
		642	goto fatal_mondo_error;
		643
		644	/* Rebuild the cpu_list[] array and try again. */
		645	cnt = 0;
		646	for_each_cpu_mask(i, mask)
		647	cpu_list[cnt++] = i;
625		648
626	udelay(2 * cnt);	649	if (unlikely(!forward_progress)) {
		650	if (unlikely(++retries > 10000))
		651	goto fatal_mondo_timeout;
		652
		653	/* Delay a little bit to let other cpus catch up
		654	* on their cpu mondo queue work.
		655	*/
		656	udelay(2 * cnt);
		657	}
627	} while (1);	658	} while (1);
628		659
629	put_cpu();	660	local_irq_restore(flags);
		661
		662	if (unlikely(!cpus_empty(error_mask)))
		663	goto fatal_mondo_cpu_error;
		664
		665	return;
		666
		667	fatal_mondo_cpu_error:
		668	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
		669	"were in error state\n",
		670	this_cpu);
		671	printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
		672	for_each_cpu_mask(i, error_mask)
		673	printk("%d ", i);
		674	printk("]\n");
		675	return;
		676
		677	fatal_mondo_timeout:
		678	local_irq_restore(flags);
		679	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
		680	" progress after %d retries.\n",
		681	this_cpu, retries);
		682	goto dump_cpu_list_and_out;
		683
		684	fatal_mondo_error:
		685	local_irq_restore(flags);
		686	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
		687	this_cpu, status);
		688	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
		689	"mondo_block_pa(%lx)\n",
		690	this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
		691
		692	dump_cpu_list_and_out:
		693	printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
		694	for (i = 0; i < cnt; i++)
		695	printk("%u ", cpu_list[i]);
		696	printk("]\n");
630	}	697	}
631		698
632	/* Send cross call to all processors mentioned in MASK	699	/* Send cross call to all processors mentioned in MASK
@@ -723,9 +790,8 @@ static int smp_call_function_mask(void (*func)(void *info), void *info,
723		790
724	out_timeout:	791	out_timeout:
725	spin_unlock(&call_lock);	792	spin_unlock(&call_lock);
726	printk("XCALL: Remote cpus not responding, ncpus=%ld finished=%ld\n",	793	printk("XCALL: Remote cpus not responding, ncpus=%d finished=%d\n",
727	(long) num_online_cpus() - 1L,	794	cpus, atomic_read(&data.finished));
728	(long) atomic_read(&data.finished));
729	return 0;	795	return 0;
730	}	796	}
731		797