MIPS: Outline udelay and fix a few issues.

Outlining fixes the issue where on certain CPUs such as the R10000 family the delay loop would need an extra cycle if it overlaps a cacheline boundary. The rewrite also fixes build errors with GCC 4.4 which was changed in a way incompatible with the kernel's inline assembly. Relying on pure C for computation of the delay value removes the need for explicit inline assembly in that computation. The price we pay is a slight slowdown of the computation - to be fixed on another day. Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
author: Ralf Baechle <ralf@linux-mips.org> 2009-02-28 04:44:28 -0500
committer: Ralf Baechle <ralf@linux-mips.org> 2009-06-08 11:57:51 -0400
commit: 5636919b5c909fee54a6ef5226475ecae012ad02 (patch)
tree: c77fa89c56ee2d493fb82117ab5dbc5b28a8deeb
parent: 3a553147eaad5d4de90ab1f695aa13ddbea684ec (diff)
5 files changed, 66 insertions, 92 deletions
diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h
index 744cd8fb107f..126044308dec 100644
--- a/arch/mips/include/asm/cpu-info.h
+++ b/arch/mips/include/asm/cpu-info.h
@@ -39,8 +39,8 @@ struct cache_desc {
 #define MIPS_CACHE_PINDEX       0x00000020      /* Physically indexed cache */
 struct cpuinfo_mips {
-        unsigned long           udelay_val;
+        unsigned int            udelay_val;
-        unsigned long           asid_cache;
+        unsigned int            asid_cache;
        /*
         * Capability and feature descriptor structure for MIPS CPU
diff --git a/arch/mips/include/asm/delay.h b/arch/mips/include/asm/delay.h
index b0bccd2c4ed5..a07e51b2be13 100644
--- a/arch/mips/include/asm/delay.h
+++ b/arch/mips/include/asm/delay.h
@@ -11,94 +11,12 @@
 #ifndef _ASM_DELAY_H
 #define _ASM_DELAY_H
-#include <linux/param.h>
+extern void __delay(unsigned int loops);
-#include <linux/smp.h>
+/* Parameter types must match the definitions in arch/mips/lib/delay.c. */
+extern void __ndelay(unsigned long ns);
+extern void __udelay(unsigned long us);
-#include <asm/compiler.h>
+/* Nanosecond delays go through __ndelay(), not __udelay(). */
+#define ndelay(ns) __ndelay(ns)
-#include <asm/war.h>
+#define udelay(us) __udelay(us)
-static inline void __delay(unsigned long loops)
-{
-        if (sizeof(long) == 4)
-                __asm__ __volatile__ (
-                "       .set    noreorder                               \n"
-                "       .align  3                                       \n"
-                "1:     bnez    %0, 1b                                  \n"
-                "       subu    %0, 1                                   \n"
-                "       .set    reorder                                 \n"
-                : "=r" (loops)
-                : "0" (loops));
-        else if (sizeof(long) == 8 && !DADDI_WAR)
-                __asm__ __volatile__ (
-                "       .set    noreorder                               \n"
-                "       .align  3                                       \n"
-                "1:     bnez    %0, 1b                                  \n"
-                "       dsubu   %0, 1                                   \n"
-                "       .set    reorder                                 \n"
-                : "=r" (loops)
-                : "0" (loops));
-        else if (sizeof(long) == 8 && DADDI_WAR)
-                __asm__ __volatile__ (
-                "       .set    noreorder                               \n"
-                "       .align  3                                       \n"
-                "1:     bnez    %0, 1b                                  \n"
-                "       dsubu   %0, %2                                  \n"
-                "       .set    reorder                                 \n"
-                : "=r" (loops)
-                : "0" (loops), "r" (1));
-}
-/*
- * Division by multiplication: you don't have to worry about
- * loss of precision.
- *
- * Use only for very small delays ( < 1 msec).  Should probably use a
- * lookup table, really, as the multiplications take much too long with
- * short delays.  This is a "reasonable" implementation, though (and the
- * first constant multiplications gets optimized away if the delay is
- * a constant)
- */
-static inline void __udelay(unsigned long usecs, unsigned long lpj)
-{
-        unsigned long hi, lo;
-        /*
-         * The rates of 128 is rounded wrongly by the catchall case
-         * for 64-bit.  Excessive precission?  Probably ...
-         */
-#if defined(CONFIG_64BIT) && (HZ == 128)
-        usecs *= 0x0008637bd05af6c7UL;          /* 2**64 / (1000000 / HZ) */
-#elif defined(CONFIG_64BIT)
-        usecs *= (0x8000000000000000UL / (500000 / HZ));
-#else /* 32-bit junk follows here */
-        usecs *= (unsigned long) (((0x8000000000000000ULL / (500000 / HZ)) +
-                                   0x80000000ULL) >> 32);
-#endif
-        if (sizeof(long) == 4)
-                __asm__("multu\t%2, %3"
-                : "=h" (usecs), "=l" (lo)
-                : "r" (usecs), "r" (lpj)
-                : GCC_REG_ACCUM);
-        else if (sizeof(long) == 8 && !R4000_WAR)
-                __asm__("dmultu\t%2, %3"
-                : "=h" (usecs), "=l" (lo)
-                : "r" (usecs), "r" (lpj)
-                : GCC_REG_ACCUM);
-        else if (sizeof(long) == 8 && R4000_WAR)
-                __asm__("dmultu\t%3, %4\n\tmfhi\t%0"
-                : "=r" (usecs), "=h" (hi), "=l" (lo)
-                : "r" (usecs), "r" (lpj)
-                : GCC_REG_ACCUM);
-        __delay(usecs);
-}
-#define __udelay_val cpu_data[raw_smp_processor_id()].udelay_val
-#define udelay(usecs) __udelay((usecs), __udelay_val)
 /* make sure "usecs *= ..." in udelay do not overflow. */
 #if HZ >= 1000
diff --git a/arch/mips/kernel/proc.c b/arch/mips/kernel/proc.c
index 26760cad8b69..e0a4ac18fa07 100644
--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -42,7 +42,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
        seq_printf(m, fmt, __cpu_name[n],
                                   (version >> 4) & 0x0f, version & 0x0f,
                                   (fp_vers >> 4) & 0x0f, fp_vers & 0x0f);
-        seq_printf(m, "BogoMIPS\t\t: %lu.%02lu\n",
+        seq_printf(m, "BogoMIPS\t\t: %u.%02u\n",
                      cpu_data[n].udelay_val / (500000/HZ),
                      (cpu_data[n].udelay_val / (5000/HZ)) % 100);
        seq_printf(m, "wait instruction\t: %s\n", cpu_wait ? "yes" : "no");
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index c13c7ad2cdae..2adead5a8a37 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -2,8 +2,8 @@
 # Makefile for MIPS-specific library files..
 #
-lib-y   += csum_partial.o memcpy.o memcpy-inatomic.o memset.o strlen_user.o \
+lib-y   += csum_partial.o delay.o memcpy.o memcpy-inatomic.o memset.o \
-           strncpy_user.o strnlen_user.o uncached.o
+           strlen_user.o strncpy_user.o strnlen_user.o uncached.o
 obj-y                   += iomap.o
 obj-$(CONFIG_PCI)       += iomap-pci.o
diff --git a/arch/mips/lib/delay.c b/arch/mips/lib/delay.c
new file mode 100644
index 000000000000..f69c6b569eb3
--- /dev/null
+++ b/arch/mips/lib/delay.c
@@ -0,0 +1,56 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1994 by Waldorf Electronics
+ * Copyright (C) 1995 - 2000, 01, 03 by Ralf Baechle
+ * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
+ * Copyright (C) 2007  Maciej W. Rozycki
+ */
+#include <linux/module.h>
+#include <linux/param.h>
+#include <linux/smp.h>
+#include <asm/compiler.h>
+#include <asm/war.h>
+/*
+ * Busy-wait by counting "loops" down to zero in a two-instruction loop.
+ *
+ * The function is a plain external definition (no "inline"): it is
+ * exported and called from other translation units, and a non-static
+ * "inline" does not emit an external definition under C99 inline rules.
+ *
+ * ".set noreorder" stops the assembler from rearranging the pair, so the
+ * subu that follows bnez occupies the branch delay slot; ".align 3"
+ * places the loop on an 8-byte boundary.
+ */
+void __delay(unsigned int loops)
+{
+        __asm__ __volatile__ (
+        "       .set    noreorder                               \n"
+        "       .align  3                                       \n"
+        "1:     bnez    %0, 1b                                  \n"
+        "       subu    %0, 1                                   \n"
+        "       .set    reorder                                 \n"
+        : "=r" (loops)
+        : "0" (loops));
+}
+EXPORT_SYMBOL(__delay);
+/*
+ * Division by multiplication: you don't have to worry about
+ * loss of precision.
+ *
+ * Use only for very small delays ( < 1 msec).  Should probably use a
+ * lookup table, really, as the multiplications take much too long with
+ * short delays.  This is a "reasonable" implementation, though (and the
+ * first constant multiplications gets optimized away if the delay is
+ * a constant)
+ *
+ * 0x10c7 is 2^32 / 1000000 rounded up, so the product shifted right by
+ * 32 approximates us * HZ * lpj / 1000000.  The ull suffix forces the
+ * whole product to 64 bits; with a 32-bit unsigned long the
+ * multiplication would otherwise wrap and the shift by 32 would be
+ * undefined.
+ */
+void __udelay(unsigned long us)
+{
+        unsigned int lpj = current_cpu_data.udelay_val;
+        __delay((us * 0x000010c7ull * HZ * lpj) >> 32);
+}
+EXPORT_SYMBOL(__udelay);
+/*
+ * Nanosecond variant of the scaled delay: 0x5 is 2^32 / 1000000000
+ * rounded up, so the product shifted right by 32 approximates
+ * ns * HZ * lpj / 1000000000.  The ull suffix keeps the product in
+ * 64 bits so it neither wraps nor makes the shift by 32 undefined
+ * when unsigned long is 32 bits wide.
+ */
+void __ndelay(unsigned long ns)
+{
+        unsigned int lpj = current_cpu_data.udelay_val;
+        __delay((ns * 0x00000005ull * HZ * lpj) >> 32);
+}
+EXPORT_SYMBOL(__ndelay);
author	Ralf Baechle <ralf@linux-mips.org>	2009-02-28 04:44:28 -0500
committer	Ralf Baechle <ralf@linux-mips.org>	2009-06-08 11:57:51 -0400
commit	5636919b5c909fee54a6ef5226475ecae012ad02 (patch)
tree	c77fa89c56ee2d493fb82117ab5dbc5b28a8deeb
parent	3a553147eaad5d4de90ab1f695aa13ddbea684ec (diff)