Added missing tegra files. (HEAD, master)

author: Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-22 10:38:37 -0500
committer: Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-22 10:38:37 -0500
commit: fcc9d2e5a6c89d22b8b773a64fb4ad21ac318446 (patch)
tree: a57612d1888735a2ec7972891b68c1ac5ec8faea /arch/sparc/lib
parent: 8dea78da5cee153b8af9c07a2745f6c55057fe12 (diff)
11 files changed, 2272 insertions, 0 deletions
diff --git a/arch/sparc/lib/atomic_32.S b/arch/sparc/lib/atomic_32.S
new file mode 100644
index 00000000000..178cbb8ae1b
--- /dev/null
+++ b/arch/sparc/lib/atomic_32.S
@@ -0,0 +1,99 @@
+/* atomic.S: Move this stuff here for better ICACHE hit rates.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caipfs.rutgers.edu)
+ */
+#include <asm/ptrace.h>
+#include <asm/psr.h>
+        .text
+        .align  4
+        .globl  __atomic_begin
+__atomic_begin:
+#ifndef CONFIG_SMP
+        .globl  ___xchg32_sun4c
+___xchg32_sun4c:        /* Exchange the word at [%g1] with %g2 (built only without CONFIG_SMP).
+         * In:    %g1 = address of the word, %g2 = new value,
+         *        %g4 = value copied back into %o7 on exit
+         *        (presumably the caller parked its own %o7 there, confirm in callers)
+         * Out:   %g2 = previous contents of [%g1]
+         * Overwrites %g3 (entry %psr) and %g7 (old value).
+         * Returns to %o7 + 8.
+         * The load and the store are separate instructions, so when every
+         * PSR_PIL bit was clear on entry the routine sets them around the
+         * pair and restores the entry %psr afterwards.
+         */
+        rd      %psr, %g3               /* %g3 = %psr as found on entry */
+        andcc   %g3, PSR_PIL, %g0       /* is any PIL bit already set? */
+        bne     1f                      /* yes: leave %psr untouched */
+         nop
+        wr      %g3, PSR_PIL, %psr      /* PIL field was 0 and wr XORs its operands: sets every PIL bit */
+        nop; nop; nop                   /* let the %psr write take effect */
+1:
+        andcc   %g3, PSR_PIL, %g0       /* redo the test for the branch below */
+        ld      [%g1], %g7              /* %g7 = old value */
+        bne     1f                      /* PIL was nonzero on entry: nothing to restore */
+         st     %g2, [%g1]              /* delay slot, not annulled: the store happens on both paths */
+        wr      %g3, 0x0, %psr          /* put the entry %psr back */
+        nop; nop; nop                   /* let the %psr write take effect */
+1:
+        mov     %g7, %g2                /* result: old value in %g2 */
+        jmpl    %o7 + 8, %g0
+         mov    %g4, %o7                /* delay slot: %o7 = %g4 */
+        .globl  ___xchg32_sun4md
+___xchg32_sun4md:       /* Exchange the word at [%g1] with %g2 using one swap instruction.
+         * In:    %g1 = address of the word, %g2 = new value,
+         *        %g4 = value copied back into %o7 on exit
+         * Out:   %g2 = previous contents of [%g1]
+         * %psr is not touched here.  Returns to %o7 + 8.
+         */
+        swap    [%g1], %g2              /* %g2 and the memory word trade places */
+        jmpl    %o7 + 8, %g0
+         mov    %g4, %o7                /* delay slot: %o7 = %g4 */
+#endif
+        /* Read asm-sparc/atomic.h carefully to understand how this works for SMP.
+         * Really, some things here for SMP are overly clever, go read the header.
+         */
+        .globl  ___atomic24_add
+___atomic24_add:        /* Add %g2 to the counter word at [%g1] with all PSR_PIL bits set.
+         * In:    %g1 = address of the counter word, %g2 = amount to add,
+         *        %g4 = value copied back into %o7 on exit
+         * Out:   %g2 = new counter value (old value plus the entry %g2)
+         * Overwrites %g3 (entry %psr) and %g7 (scratch).
+         * With CONFIG_SMP the value occupies the upper 24 bits of the word
+         * and the byte at offset 3 is a lock byte taken with ldstub.
+         * Without CONFIG_SMP the whole word is loaded, added to and stored.
+         */
+        rd      %psr, %g3               ! Keep the code small, old way was stupid
+        nop; nop; nop;                  ! Let the bits set
+        or      %g3, PSR_PIL, %g7       ! Disable interrupts
+        wr      %g7, 0x0, %psr          ! Set %psr
+        nop; nop; nop;                  ! Let the bits set
+#ifdef CONFIG_SMP
+1:      ldstub  [%g1 + 3], %g7          ! Spin on the byte lock for SMP.
+        orcc    %g7, 0x0, %g0           ! Did we get it?
+        bne     1b                      ! Nope...
+         ld     [%g1], %g7              ! Load locked atomic24_t
+        sra     %g7, 8, %g7             ! Get signed 24-bit integer
+        add     %g7, %g2, %g2           ! Add in argument
+        sll     %g2, 8, %g7             ! Transpose back to atomic24_t
+        st      %g7, [%g1]              ! Clever: This releases the lock as well.
+#else
+        ld      [%g1], %g7              ! Load locked atomic24_t
+        add     %g7, %g2, %g2           ! Add in argument
+        st      %g2, [%g1]              ! Store it back
+#endif
+        wr      %g3, 0x0, %psr          ! Restore original PSR_PIL
+        nop; nop; nop;                  ! Let the bits set
+        jmpl    %o7, %g0                ! NOTE: not + 8, see callers in atomic.h
+         mov    %g4, %o7                ! Restore %o7
+        .globl  ___atomic24_sub
+___atomic24_sub:        /* Subtract %g2 from the counter word at [%g1] with all PSR_PIL bits set.
+         * In:    %g1 = address of the counter word, %g2 = amount to subtract,
+         *        %g4 = value copied back into %o7 on exit
+         * Out:   %g2 = new counter value (old value minus the entry %g2)
+         * Overwrites %g3 (entry %psr) and %g7 (scratch).
+         * With CONFIG_SMP the value occupies the upper 24 bits of the word
+         * and the byte at offset 3 is a lock byte taken with ldstub.
+         * Without CONFIG_SMP the whole word is loaded, reduced and stored.
+         */
+        rd      %psr, %g3               ! Keep the code small, old way was stupid
+        nop; nop; nop;                  ! Let the bits set
+        or      %g3, PSR_PIL, %g7       ! Disable interrupts
+        wr      %g7, 0x0, %psr          ! Set %psr
+        nop; nop; nop;                  ! Let the bits set
+#ifdef CONFIG_SMP
+1:      ldstub  [%g1 + 3], %g7          ! Spin on the byte lock for SMP.
+        orcc    %g7, 0x0, %g0           ! Did we get it?
+        bne     1b                      ! Nope...
+         ld     [%g1], %g7              ! Load locked atomic24_t
+        sra     %g7, 8, %g7             ! Get signed 24-bit integer
+        sub     %g7, %g2, %g2           ! Subtract argument
+        sll     %g2, 8, %g7             ! Transpose back to atomic24_t
+        st      %g7, [%g1]              ! Clever: This releases the lock as well
+#else
+        ld      [%g1], %g7              ! Load locked atomic24_t
+        sub     %g7, %g2, %g2           ! Subtract argument
+        st      %g2, [%g1]              ! Store it back
+#endif
+        wr      %g3, 0x0, %psr          ! Restore original PSR_PIL
+        nop; nop; nop;                  ! Let the bits set
+        jmpl    %o7, %g0                ! NOTE: not + 8, see callers in atomic.h
+         mov    %g4, %o7                ! Restore %o7
+        .globl  __atomic_end
+__atomic_end:
diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S
new file mode 100644
index 00000000000..c45470d0b0c
--- /dev/null
+++ b/arch/sparc/lib/mul.S
@@ -0,0 +1,137 @@
+/*
+ * mul.S:       This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+/*
+ * Signed multiply, from Appendix E of the Sparc Version 8
+ * Architecture Manual.
+ */
+/*
+ * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
+ * the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies.
+ */
+        .globl .mul
+        .globl _Mul
+.mul:
+_Mul:   /* needed for export
+         *
+         * In:    %o0 = multiplier, %o1 = multiplicand
+         * Out:   %o0 = low 32 bits of the product, %o1 = high 32 bits
+         * Overwrites %y, %o4 (partial product), %o2 (long path only)
+         * and %o5 (short path only).
+         * Two paths: when bits 12..31 of %o0 are all zero, 12 mulscc steps
+         * are enough (Lmul_shortway), otherwise all 32 steps are run and
+         * the high word is corrected for a negative %o0.
+         */
+        mov     %o0, %y         ! multiplier -> Y
+        andncc  %o0, 0xfff, %g0 ! test bits 12..31
+        be      Lmul_shortway   ! if zero, can do it the short way
+         andcc  %g0, %g0, %o4   ! zero the partial product and clear N and V
+        /*
+         * Long multiply.  32 steps, followed by a final shift step.
+         */
+        mulscc  %o4, %o1, %o4   ! 1
+        mulscc  %o4, %o1, %o4   ! 2
+        mulscc  %o4, %o1, %o4   ! 3
+        mulscc  %o4, %o1, %o4   ! 4
+        mulscc  %o4, %o1, %o4   ! 5
+        mulscc  %o4, %o1, %o4   ! 6
+        mulscc  %o4, %o1, %o4   ! 7
+        mulscc  %o4, %o1, %o4   ! 8
+        mulscc  %o4, %o1, %o4   ! 9
+        mulscc  %o4, %o1, %o4   ! 10
+        mulscc  %o4, %o1, %o4   ! 11
+        mulscc  %o4, %o1, %o4   ! 12
+        mulscc  %o4, %o1, %o4   ! 13
+        mulscc  %o4, %o1, %o4   ! 14
+        mulscc  %o4, %o1, %o4   ! 15
+        mulscc  %o4, %o1, %o4   ! 16
+        mulscc  %o4, %o1, %o4   ! 17
+        mulscc  %o4, %o1, %o4   ! 18
+        mulscc  %o4, %o1, %o4   ! 19
+        mulscc  %o4, %o1, %o4   ! 20
+        mulscc  %o4, %o1, %o4   ! 21
+        mulscc  %o4, %o1, %o4   ! 22
+        mulscc  %o4, %o1, %o4   ! 23
+        mulscc  %o4, %o1, %o4   ! 24
+        mulscc  %o4, %o1, %o4   ! 25
+        mulscc  %o4, %o1, %o4   ! 26
+        mulscc  %o4, %o1, %o4   ! 27
+        mulscc  %o4, %o1, %o4   ! 28
+        mulscc  %o4, %o1, %o4   ! 29
+        mulscc  %o4, %o1, %o4   ! 30
+        mulscc  %o4, %o1, %o4   ! 31
+        mulscc  %o4, %o1, %o4   ! 32
+        mulscc  %o4, %g0, %o4   ! final shift
+        ! If %o0 was negative, the result is
+        !       (%o0 * %o1) + (%o1 << 32))
+        ! We fix that here.
+#if 0
+        tst     %o0             /* disabled variant: the same fixup done with a branch */
+        bge     1f
+         rd     %y, %o0
+        ! %o0 was indeed negative; fix upper 32 bits of result by subtracting 
+        ! %o1 (i.e., return %o4 - %o1 in %o1).
+        retl
+         sub    %o4, %o1, %o1
+1:
+        retl
+         mov    %o4, %o1
+#else
+        /* Faster code adapted from tege@sics.se's code for umul.S.  */
+        sra     %o0, 31, %o2    ! make mask from sign bit
+        and     %o1, %o2, %o2   ! %o2 = 0 or %o1, depending on sign of %o0
+        rd      %y, %o0         ! get lower half of product
+        retl
+         sub    %o4, %o2, %o1   ! subtract compensation 
+                                !  and put upper half in place
+#endif
+Lmul_shortway:
+        /*
+         * Short multiply.  12 steps, followed by a final shift step.
+         * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+         * but there is no problem with %o0 being negative (unlike above).
+         */
+        mulscc  %o4, %o1, %o4   ! 1
+        mulscc  %o4, %o1, %o4   ! 2
+        mulscc  %o4, %o1, %o4   ! 3
+        mulscc  %o4, %o1, %o4   ! 4
+        mulscc  %o4, %o1, %o4   ! 5
+        mulscc  %o4, %o1, %o4   ! 6
+        mulscc  %o4, %o1, %o4   ! 7
+        mulscc  %o4, %o1, %o4   ! 8
+        mulscc  %o4, %o1, %o4   ! 9
+        mulscc  %o4, %o1, %o4   ! 10
+        mulscc  %o4, %o1, %o4   ! 11
+        mulscc  %o4, %o1, %o4   ! 12
+        mulscc  %o4, %g0, %o4   ! final shift
+        /*
+         *  %o4 has 20 of the bits that should be in the low part of the
+         * result; %y has the bottom 12 (as %y's top 12).  That is:
+         *
+         *        %o4               %y
+         * +----------------+----------------+
+         * | -12- |   -20-  | -12- |   -20-  |
+         * +------(---------+------)---------+
+         *  --hi-- ----low-part----
+         *
+         * The upper 12 bits of %o4 should be sign-extended to form the
+         * high part of the product (i.e., highpart = %o4 >> 20).
+         */
+        rd      %y, %o5         /* %o5 = %y, its top 12 bits are the lowest product bits */
+        sll     %o4, 12, %o0    ! shift middle bits left 12
+        srl     %o5, 20, %o5    ! shift low bits right 20, zero fill at left
+        or      %o5, %o0, %o0   ! construct low part of result
+        retl
+         sra    %o4, 20, %o1    ! ... and extract high part of result
+        .globl  .mul_patch
+.mul_patch:             /* Same register interface as .mul, done with the smul instruction.
+         * In:    %o0, %o1 = operands
+         * Out:   %o0 = low 32 bits of the product, %o1 = high 32 bits
+         * NOTE(review): the name suggests these instructions are copied over
+         * .mul on CPUs that implement smul, confirm where .mul_patch is used
+         * before changing the length or order of this sequence.
+         */
+        smul    %o0, %o1, %o0   /* low word to %o0, high word is left in %y */
+        retl
+         rd     %y, %o1         /* delay slot: fetch the high word */
+        nop
diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S
new file mode 100644
index 00000000000..42fb8625281
--- /dev/null
+++ b/arch/sparc/lib/rem.S
@@ -0,0 +1,384 @@
+/*
+ * rem.S:       This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .rem        name of function to generate
+ *  rem         rem=div => %o0 / %o1; rem=rem => %o0 % %o1
+ *  true                true=true => signed; true=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N           how many bits per iteration we try to get (4)
+ *  WORDSIZE    total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS     number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q           the partial quotient under development (initially 0)
+ *  R           the remainder so far, initially the dividend
+ *  ITER        number of main division loop iterations required;
+ *              equal to ceil(log2(quotient) / N).  Note that this
+ *              is the log base (2^N) of the quotient.
+ *  V           the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *      ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+        .globl .rem
+        .globl _Rem
+.rem:
+_Rem:   /* needed for export
+         *
+         * Signed remainder.
+         * In:    %o0 = dividend, %o1 = divisor
+         * Out:   %o0 = remainder, negated at the end when the dividend was
+         *        negative, or 0 if the divide-by-zero trap returns
+         * Register roles in the code below:
+         *   %g2 = copy of the entry dividend (only its sign is tested)
+         *   %o3 = R, running remainder     %o5 = V, scaled comparand
+         *   %o2 = Q, partial quotient      %o4 = ITER, 4-bit passes left
+         *   %g7 = single-bit steps left    %g1 = 1 << 27 threshold, later
+         *                                        shifted up to the top bit
+         */
+        ! compute sign of result; if neither is negative, no problem
+        orcc    %o1, %o0, %g0   ! either negative?
+        bge     2f                      ! no, go do the divide
+         mov    %o0, %g2        ! compute sign in any case
+        tst     %o1
+        bge     1f
+         tst    %o0
+        ! %o1 is definitely negative; %o0 might also be negative
+        bge     2f                      ! if %o0 not negative...
+         sub    %g0, %o1, %o1   ! in any case, make %o1 nonneg
+1:      ! %o0 is negative, %o1 is nonnegative
+        sub     %g0, %o0, %o0   ! make %o0 nonnegative
+2:
+        ! Ready to divide.  Compute size of quotient; scale comparand.
+        orcc    %o1, %g0, %o5   /* %o5 = V = divisor after the sign handling, Z set when it is zero */
+        bne     1f
+         mov    %o0, %o3        /* delay slot: %o3 = R = dividend after the sign handling */
+                ! Divide by zero trap.  If it returns, return 0 (about as
+                ! wrong as possible, but that is what SunOS does...).
+                ta      ST_DIV0 /* NOTE(review): ST_DIV0 is not defined in the visible text, presumably it comes from a trap-number header */
+                retl
+                 clr    %o0
+1:
+        cmp     %o3, %o5                        ! if %o1 exceeds %o0, done
+        blu     Lgot_result             ! (and algorithm fails otherwise)
+         clr    %o2
+        sethi   %hi(1 << (32 - 4 - 1)), %g1
+        cmp     %o3, %g1
+        blu     Lnot_really_big
+         clr    %o4
+        ! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+        ! as our usual N-at-a-shot divide step will cause overflow and havoc.
+        ! The number of bits in the result here is N*ITER+SC, where SC <= N.
+        ! Compute ITER in an unorthodox manner: know we need to shift V into
+        ! the top decade: so do not even bother to compare to R.
+        1:
+                cmp     %o5, %g1
+                bgeu    3f
+                 mov    1, %g7
+                sll     %o5, 4, %o5
+                b       1b
+                 add    %o4, 1, %o4
+        ! Now compute %g7.
+        2:
+                addcc   %o5, %o5, %o5
+                bcc     Lnot_too_big
+                 add    %g7, 1, %g7
+                ! We get here if the %o1 overflowed while shifting.
+                ! This means that %o3 has the high-order bit set.
+                ! Restore %o5 and subtract from %o3.
+                sll     %g1, 4, %g1     ! high order bit
+                srl     %o5, 1, %o5             ! rest of %o5
+                add     %o5, %g1, %o5
+                b       Ldo_single_div
+                 sub    %g7, 1, %g7
+        Lnot_too_big:
+        3:
+                cmp     %o5, %o3
+                blu     2b
+                 nop
+                be      Ldo_single_div
+                 nop
+        /* NB: these are commented out in the V8-Sparc manual as well */
+        /* (I do not understand this) */
+        ! %o5 > %o3: went too far: back up 1 step
+        !       srl     %o5, 1, %o5
+        !       dec     %g7
+        ! do single-bit divide steps
+        !
+        ! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+        ! first divide step without thinking.  BUT, the others are conditional,
+        ! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+        ! order bit set in the first step, just falling into the regular
+        ! division loop will mess up the first time around.
+        ! So we unroll slightly...
+        Ldo_single_div:
+                subcc   %g7, 1, %g7
+                bl      Lend_regular_divide
+                 nop
+                sub     %o3, %o5, %o3
+                mov     1, %o2
+                b       Lend_single_divloop
+                 nop
+        Lsingle_divloop:
+                sll     %o2, 1, %o2
+                bl      1f
+                 srl    %o5, 1, %o5
+                ! %o3 >= 0
+                sub     %o3, %o5, %o3
+                b       2f
+                 add    %o2, 1, %o2
+        1:      ! %o3 < 0
+                add     %o3, %o5, %o3
+                sub     %o2, 1, %o2
+        2:
+        Lend_single_divloop:
+                subcc   %g7, 1, %g7
+                bge     Lsingle_divloop
+                 tst    %o3
+                b,a     Lend_regular_divide
+Lnot_really_big:
+1:
+        sll     %o5, 4, %o5
+        cmp     %o5, %o3
+        bleu    1b
+         addcc  %o4, 1, %o4
+        be      Lgot_result
+         sub    %o4, 1, %o4
+        tst     %o3     ! set up for initial iteration
+Ldivloop:       /* One pass develops 4 quotient bits through an unrolled decision tree.
+         * Each L.<depth>.<16 + accumulated bits> label is the target taken
+         * when the remainder is negative at that depth.
+         */
+        sll     %o2, 4, %o2
+                ! depth 1, accumulated bits 0
+        bl      L.1.16
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits 1
+        bl      L.2.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 3
+        bl      L.3.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 7
+        bl      L.4.23
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2+1), %o2
+        
+L.4.23:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2-1), %o2
+        
+L.3.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 5
+        bl      L.4.21
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2+1), %o2
+        
+L.4.21:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2-1), %o2
+        
+L.2.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 1
+        bl      L.3.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 3
+        bl      L.4.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2+1), %o2
+L.4.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2-1), %o2
+L.3.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 1
+        bl      L.4.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2+1), %o2
+L.4.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2-1), %o2
+L.1.16:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits -1
+        bl      L.2.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -1
+        bl      L.3.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -1
+        bl      L.4.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2+1), %o2
+L.4.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2-1), %o2
+L.3.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -3
+        bl      L.4.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2+1), %o2
+L.4.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2-1), %o2
+L.2.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -3
+        bl      L.3.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -5
+        bl      L.4.11
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2+1), %o2
+L.4.11:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2-1), %o2
+L.3.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -7
+        bl      L.4.9
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2+1), %o2
+L.4.9:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2-1), %o2
+        9:
+Lend_regular_divide:
+        subcc   %o4, 1, %o4
+        bge     Ldivloop
+         tst    %o3
+        bl,a    Lgot_result     /* annulling branch: the add below runs only when R is negative */
+        ! non-restoring fixup here (one instruction only!)
+        add     %o3, %o1, %o3
+Lgot_result:
+        ! check to see if answer should be < 0
+        tst     %g2
+        bl,a    1f
+         sub %g0, %o3, %o3
+1:
+        retl
+         mov %o3, %o0
+        .globl  .rem_patch
+.rem_patch:             /* Signed remainder computed with the sdivcc and smul instructions:
+         * remainder = dividend minus (quotient times divisor).
+         * In:    %o0 = dividend, %o1 = divisor
+         * Out:   %o0 = remainder
+         * Overwrites %y, %o2 and %o4.
+         * NOTE(review): the name suggests these instructions are copied over
+         * .rem on CPUs that implement sdiv and smul, confirm where .rem_patch
+         * is used before changing the length or order of this sequence.
+         */
+        sra     %o0, 0x1f, %o4          /* %o4 = sign extension of the dividend */
+        wr      %o4, 0x0, %y            /* %y = upper word of the 64-bit dividend */
+        nop
+        nop
+        nop
+        sdivcc  %o0, %o1, %o2           /* %o2 = quotient, V is set on overflow */
+        bvs,a   1f
+         xnor   %o2, %g0, %o2           /* annulled slot: runs only on overflow, inverts every bit of %o2 */
+1:      smul    %o2, %o1, %o2           /* %o2 = quotient times divisor */
+        retl
+         sub    %o0, %o2, %o0           /* delay slot: the remainder */
+        nop
diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S
new file mode 100644
index 00000000000..f0a0d4e4db7
--- /dev/null
+++ b/arch/sparc/lib/sdiv.S
@@ -0,0 +1,381 @@
+/*
+ * sdiv.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .div        name of function to generate
+ *  div         div=div => %o0 / %o1; div=rem => %o0 % %o1
+ *  true                true=true => signed; true=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N           how many bits per iteration we try to get (4)
+ *  WORDSIZE    total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS     number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q           the partial quotient under development (initially 0)
+ *  R           the remainder so far, initially the dividend
+ *  ITER        number of main division loop iterations required;
+ *              equal to ceil(log2(quotient) / N).  Note that this
+ *              is the log base (2^N) of the quotient.
+ *  V           the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *      ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+        .globl .div
+        .globl _Div
+.div:
+_Div:   /* needed for export
+         *
+         * Signed division.
+         * In:    %o0 = dividend, %o1 = divisor
+         * Out:   %o0 = quotient, negated at the end when the entry operands
+         *        had different signs, or 0 if the divide-by-zero trap returns
+         * Register roles in the code below:
+         *   %g2 = entry %o1 xor entry %o0 (only its sign is tested)
+         *   %o3 = R, running remainder     %o5 = V, scaled comparand
+         *   %o2 = Q, partial quotient      %o4 = ITER, 4-bit passes left
+         *   %g7 = single-bit steps left    %g1 = 1 << 27 threshold, later
+         *                                        shifted up to the top bit
+         */
+        ! compute sign of result; if neither is negative, no problem
+        orcc    %o1, %o0, %g0   ! either negative?
+        bge     2f                      ! no, go do the divide
+         xor    %o1, %o0, %g2   ! compute sign in any case
+        tst     %o1
+        bge     1f
+         tst    %o0
+        ! %o1 is definitely negative; %o0 might also be negative
+        bge     2f                      ! if %o0 not negative...
+         sub    %g0, %o1, %o1   ! in any case, make %o1 nonneg
+1:      ! %o0 is negative, %o1 is nonnegative
+        sub     %g0, %o0, %o0   ! make %o0 nonnegative
+2:
+        ! Ready to divide.  Compute size of quotient; scale comparand.
+        orcc    %o1, %g0, %o5   /* %o5 = V = divisor after the sign handling, Z set when it is zero */
+        bne     1f
+         mov    %o0, %o3        /* delay slot: %o3 = R = dividend after the sign handling */
+                ! Divide by zero trap.  If it returns, return 0 (about as
+                ! wrong as possible, but that is what SunOS does...).
+                ta      ST_DIV0 /* NOTE(review): ST_DIV0 is not defined in the visible text, presumably it comes from a trap-number header */
+                retl
+                 clr    %o0
+1:
+        cmp     %o3, %o5                        ! if %o1 exceeds %o0, done
+        blu     Lgot_result             ! (and algorithm fails otherwise)
+         clr    %o2
+        sethi   %hi(1 << (32 - 4 - 1)), %g1
+        cmp     %o3, %g1
+        blu     Lnot_really_big
+         clr    %o4
+        ! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+        ! as our usual N-at-a-shot divide step will cause overflow and havoc.
+        ! The number of bits in the result here is N*ITER+SC, where SC <= N.
+        ! Compute ITER in an unorthodox manner: know we need to shift V into
+        ! the top decade: so do not even bother to compare to R.
+        1:
+                cmp     %o5, %g1
+                bgeu    3f
+                 mov    1, %g7
+                sll     %o5, 4, %o5
+                b       1b
+                 add    %o4, 1, %o4
+        ! Now compute %g7.
+        2:
+                addcc   %o5, %o5, %o5
+                bcc     Lnot_too_big
+                 add    %g7, 1, %g7
+                ! We get here if the %o1 overflowed while shifting.
+                ! This means that %o3 has the high-order bit set.
+                ! Restore %o5 and subtract from %o3.
+                sll     %g1, 4, %g1     ! high order bit
+                srl     %o5, 1, %o5             ! rest of %o5
+                add     %o5, %g1, %o5
+                b       Ldo_single_div
+                 sub    %g7, 1, %g7
+        Lnot_too_big:
+        3:
+                cmp     %o5, %o3
+                blu     2b
+                 nop
+                be      Ldo_single_div
+                 nop
+        /* NB: these are commented out in the V8-Sparc manual as well */
+        /* (I do not understand this) */
+        ! %o5 > %o3: went too far: back up 1 step
+        !       srl     %o5, 1, %o5
+        !       dec     %g7
+        ! do single-bit divide steps
+        !
+        ! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+        ! first divide step without thinking.  BUT, the others are conditional,
+        ! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+        ! order bit set in the first step, just falling into the regular
+        ! division loop will mess up the first time around.
+        ! So we unroll slightly...
+        Ldo_single_div:
+                subcc   %g7, 1, %g7
+                bl      Lend_regular_divide
+                 nop
+                sub     %o3, %o5, %o3
+                mov     1, %o2
+                b       Lend_single_divloop
+                 nop
+        Lsingle_divloop:
+                sll     %o2, 1, %o2
+                bl      1f
+                 srl    %o5, 1, %o5
+                ! %o3 >= 0
+                sub     %o3, %o5, %o3
+                b       2f
+                 add    %o2, 1, %o2
+        1:      ! %o3 < 0
+                add     %o3, %o5, %o3
+                sub     %o2, 1, %o2
+        2:
+        Lend_single_divloop:
+                subcc   %g7, 1, %g7
+                bge     Lsingle_divloop
+                 tst    %o3
+                b,a     Lend_regular_divide
+Lnot_really_big:
+1:
+        sll     %o5, 4, %o5
+        cmp     %o5, %o3
+        bleu    1b
+         addcc  %o4, 1, %o4
+        be      Lgot_result
+         sub    %o4, 1, %o4
+        tst     %o3     ! set up for initial iteration
+Ldivloop:       /* One pass develops 4 quotient bits through an unrolled decision tree.
+         * Each L.<depth>.<16 + accumulated bits> label is the target taken
+         * when the remainder is negative at that depth.
+         */
+        sll     %o2, 4, %o2
+                ! depth 1, accumulated bits 0
+        bl      L.1.16
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits 1
+        bl      L.2.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 3
+        bl      L.3.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 7
+        bl      L.4.23
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2+1), %o2
+L.4.23:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2-1), %o2
+L.3.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 5
+        bl      L.4.21
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2+1), %o2
+L.4.21:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2-1), %o2
+L.2.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 1
+        bl      L.3.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 3
+        bl      L.4.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2+1), %o2
+L.4.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2-1), %o2
+        
+        
+L.3.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 1
+        bl      L.4.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2+1), %o2
+L.4.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2-1), %o2
+L.1.16:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits -1
+        bl      L.2.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -1
+        bl      L.3.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -1
+        bl      L.4.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2+1), %o2
+L.4.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2-1), %o2
+L.3.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -3
+        bl      L.4.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2+1), %o2
+L.4.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2-1), %o2
+L.2.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -3
+        bl      L.3.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -5
+        bl      L.4.11
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2+1), %o2
+L.4.11:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2-1), %o2
+L.3.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -7
+        bl      L.4.9
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2+1), %o2
+L.4.9:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2-1), %o2
+        9:
+Lend_regular_divide:
+        subcc   %o4, 1, %o4
+        bge     Ldivloop
+         tst    %o3
+        bl,a    Lgot_result     /* annulling branch: the sub below runs only when R is negative */
+        ! non-restoring fixup here (one instruction only!)
+        sub     %o2, 1, %o2
+Lgot_result:
+        ! check to see if answer should be < 0
+        tst     %g2
+        bl,a    1f
+         sub %g0, %o2, %o2
+1:
+        retl
+         mov %o2, %o0
+        .globl  .div_patch
+.div_patch:             /* Signed quotient computed with the sdivcc instruction.
+         * In:    %o0 = dividend, %o1 = divisor
+         * Out:   %o0 = quotient
+         * Overwrites %y and %o2.
+         * NOTE(review): the name suggests these instructions are copied over
+         * .div on CPUs that implement sdiv, confirm where .div_patch is used
+         * before changing the length or order of this sequence.
+         */
+        sra     %o0, 0x1f, %o2          /* %o2 = sign extension of the dividend */
+        wr      %o2, 0x0, %y            /* %y = upper word of the 64-bit dividend */
+        nop
+        nop
+        nop
+        sdivcc  %o0, %o1, %o0           /* %o0 = quotient, V is set on overflow */
+        bvs,a   1f
+         xnor   %o0, %g0, %o0           /* annulled slot: runs only on overflow, inverts every bit of %o0 */
+1:      retl
+         nop
diff --git a/arch/sparc/lib/strlen_user_32.S b/arch/sparc/lib/strlen_user_32.S
new file mode 100644
index 00000000000..8c8a371df3c
--- /dev/null
+++ b/arch/sparc/lib/strlen_user_32.S
@@ -0,0 +1,109 @@
+/* strlen_user.S: Sparc optimized strlen_user code
+ *
+ * Return length of string in userspace including terminating 0
+ * or 0 for error
+ *
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+10:                                     ! unaligned start: test 1st byte (user load, listed in __ex_table)
+        ldub    [%o0], %o5
+        cmp     %o5, 0
+        be      1f                      ! NUL in 1st byte: return 1
+         add    %o0, 1, %o0
+        andcc   %o0, 3, %g0             ! word aligned yet?
+        be      4f
+         or     %o4, %lo(HI_MAGIC), %o3 ! delay slot: %o3 = HI_MAGIC
+11:                                     ! test 2nd byte (user load, listed in __ex_table)
+        ldub    [%o0], %o5
+        cmp     %o5, 0
+        be      2f                      ! NUL in 2nd byte: return 2
+         add    %o0, 1, %o0
+        andcc   %o0, 3, %g0
+        be      5f
+         sethi  %hi(LO_MAGIC), %o4      ! delay slot: upper bits of LO_MAGIC
+12:                                     ! test 3rd byte (user load, listed in __ex_table)
+        ldub    [%o0], %o5
+        cmp     %o5, 0
+        be      3f                      ! NUL in 3rd byte: return 3
+         add    %o0, 1, %o0
+        b       13f                     ! at most three byte steps are needed to reach word alignment
+         or     %o4, %lo(LO_MAGIC), %o2 ! delay slot: %o2 = LO_MAGIC
+1:
+        retl
+         mov    1, %o0                  ! length 1, terminator included
+2:
+        retl
+         mov    2, %o0                  ! length 2, terminator included
+3:
+        retl
+         mov    3, %o0                  ! length 3, terminator included
+        .align 4
+        .global __strlen_user, __strnlen_user
+__strlen_user:                          ! in: %o0 = string; uses a fixed limit and falls through
+        sethi   %hi(32768), %o1         ! %o1 = 32768 (limit)
+__strnlen_user:                         ! in: %o0 = string, %o1 = limit; out: %o0 = length incl. NUL, 0 on fault
+        mov     %o1, %g1                ! %g1 = limit
+        mov     %o0, %o1                ! %o1 = start of string, kept for the final subtraction
+        andcc   %o0, 3, %g0             ! word aligned?
+        bne     10b                     ! no: step byte by byte first
+         sethi  %hi(HI_MAGIC), %o4      ! delay slot: upper bits of HI_MAGIC
+        or      %o4, %lo(HI_MAGIC), %o3 ! %o3 = HI_MAGIC
+4:
+        sethi   %hi(LO_MAGIC), %o4
+5:
+        or      %o4, %lo(LO_MAGIC), %o2 ! %o2 = LO_MAGIC
+13:                                     ! word loop (user load, listed in __ex_table)
+        ld      [%o0], %o5
+2:
+        sub     %o5, %o2, %o4           ! (word - LO_MAGIC) & HI_MAGIC != 0 means a byte may be zero
+        andcc   %o4, %o3, %g0
+        bne     82f                     ! candidate word: recheck it byte by byte
+         add    %o0, 4, %o0             ! delay slot: advance past this word
+        sub     %o0, %o1, %g2           ! %g2 = bytes scanned so far
+81:     cmp     %g2, %g1
+        blu     13b                     ! keep scanning while scanned < limit
+         mov    %o0, %o4                ! delay slot: %o4 = current end pointer
+        ba,a    1f                      ! limit reached: return %o4 - start
+        /* Check every byte. */
+82:     srl     %o5, 24, %g5            ! most significant byte = lowest address
+        andcc   %g5, 0xff, %g0
+        be      1f
+         add    %o0, -3, %o4            ! delay slot: %o4 = address just past the 1st byte of the word
+        srl     %o5, 16, %g5
+        andcc   %g5, 0xff, %g0
+        be      1f
+         add    %o4, 1, %o4
+        srl     %o5, 8, %g5
+        andcc   %g5, 0xff, %g0
+        be      1f
+         add    %o4, 1, %o4
+        andcc   %o5, 0xff, %g0
+        bne     81b                     ! no zero byte after all: resume the word loop
+         sub    %o0, %o1, %g2           ! delay slot: recompute bytes scanned for the limit check
+        add     %o4, 1, %o4             ! NUL is the last byte of the word
+1:
+        retl
+         sub    %o4, %o1, %o0           ! return end pointer - start
+        .section .fixup,#alloc,#execinstr
+        .align  4
+9:                                      ! fixup target named by the __ex_table entries below
+        retl
+         clr    %o0                     ! return 0 for error
+        .section __ex_table,#alloc
+        .align  4
+        .word   10b, 9b                 ! pairs: instruction that may fault, fixup to run
+        .word   11b, 9b
+        .word   12b, 9b
+        .word   13b, 9b
diff --git a/arch/sparc/lib/strlen_user_64.S b/arch/sparc/lib/strlen_user_64.S
new file mode 100644
index 00000000000..114ed111e25
--- /dev/null
+++ b/arch/sparc/lib/strlen_user_64.S
@@ -0,0 +1,95 @@
+/* strlen_user.S: Sparc64 optimized strlen_user code
+ *
+ * Return length of string in userspace including terminating 0
+ * or 0 for error
+ *
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996,1999 David S. Miller (davem@redhat.com)
+ * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+#include <asm/asi.h>
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+        .align 4
+        .global __strlen_user, __strnlen_user
+__strlen_user:                          ! in: %o0 = string; uses a fixed limit and falls through
+        sethi   %hi(32768), %o1         ! %o1 = 32768 (limit)
+__strnlen_user:                         ! in: %o0 = string, %o1 = limit; out: %o0 = length incl. NUL, 0 on fault
+        mov     %o1, %g1                ! %g1 = limit
+        mov     %o0, %o1                ! %o1 = start of string, kept for the final subtraction
+        andcc   %o0, 3, %g0             ! word aligned?
+        be,pt   %icc, 9f                ! yes: go build the constants and enter the word loop
+         sethi  %hi(HI_MAGIC), %o4      ! delay slot: upper bits of HI_MAGIC
+10:     lduba   [%o0] %asi, %o5         ! 1st unaligned byte, loaded through the space selected by %asi
+        brz,pn  %o5, 21f                ! NUL in 1st byte: return 1
+         add    %o0, 1, %o0
+        andcc   %o0, 3, %g0
+        be,pn   %icc, 4f
+         or     %o4, %lo(HI_MAGIC), %o3 ! delay slot: %o3 = HI_MAGIC
+11:     lduba   [%o0] %asi, %o5         ! 2nd byte
+        brz,pn  %o5, 22f                ! NUL in 2nd byte: return 2
+         add    %o0, 1, %o0
+        andcc   %o0, 3, %g0
+        be,pt   %icc, 13f
+         srl    %o3, 7, %o2             ! delay slot: %o2 = HI_MAGIC >> 7 = 0x01010101
+12:     lduba   [%o0] %asi, %o5         ! 3rd byte
+        brz,pn  %o5, 23f                ! NUL in 3rd byte: return 3
+         add    %o0, 1, %o0
+        ba,pt   %icc, 2f                ! enter the word loop after its load
+15:      lda    [%o0] %asi, %o5         ! delay slot: load the first full word
+9:      or      %o4, %lo(HI_MAGIC), %o3 ! %o3 = HI_MAGIC
+4:      srl     %o3, 7, %o2             ! %o2 = HI_MAGIC >> 7 = 0x01010101
+13:     lda     [%o0] %asi, %o5         ! word loop load
+2:      sub     %o5, %o2, %o4           ! (word - %o2) & %o3 != 0 means a byte may be zero
+        andcc   %o4, %o3, %g0
+        bne,pn  %icc, 82f               ! candidate word: recheck it byte by byte
+         add    %o0, 4, %o0             ! delay slot: advance past this word
+        sub     %o0, %o1, %g2           ! %g2 = bytes scanned so far
+81:     cmp     %g2, %g1
+        blu,pt  %icc, 13b               ! keep scanning while scanned < limit
+         mov    %o0, %o4                ! delay slot: %o4 = current end pointer
+        ba,a,pt %xcc, 1f                ! limit reached: return %o4 - start
+        /* Check every byte. */
+82:     srl     %o5, 24, %g7            ! most significant byte = lowest address
+        andcc   %g7, 0xff, %g0
+        be,pn   %icc, 1f
+         add    %o0, -3, %o4            ! delay slot: %o4 = address just past the 1st byte of the word
+        srl     %o5, 16, %g7
+        andcc   %g7, 0xff, %g0
+        be,pn   %icc, 1f
+         add    %o4, 1, %o4
+        srl     %o5, 8, %g7
+        andcc   %g7, 0xff, %g0
+        be,pn   %icc, 1f
+         add    %o4, 1, %o4
+        andcc   %o5, 0xff, %g0
+        bne,pt  %icc, 81b               ! no zero byte after all: resume the word loop
+         sub    %o0, %o1, %g2           ! delay slot: recompute bytes scanned for the limit check
+        add     %o4, 1, %o4             ! NUL is the last byte of the word
+1:      retl
+         sub    %o4, %o1, %o0           ! return end pointer - start
+21:     retl
+         mov    1, %o0                  ! length 1, terminator included
+22:     retl
+         mov    2, %o0                  ! length 2, terminator included
+23:     retl
+         mov    3, %o0                  ! length 3, terminator included
+        .section .fixup,#alloc,#execinstr
+        .align  4
+30:                                     ! fixup target named by the __ex_table entries below
+        retl
+         clr    %o0                     ! return 0 for error
+        .section __ex_table,"a"
+        .align  4
+        .word   10b, 30b                ! pairs: instruction that may fault, fixup to run
+        .word   11b, 30b
+        .word   12b, 30b
+        .word   15b, 30b
+        .word   13b, 30b
diff --git a/arch/sparc/lib/strncpy_from_user_32.S b/arch/sparc/lib/strncpy_from_user_32.S
new file mode 100644
index 00000000000..d77198976a6
--- /dev/null
+++ b/arch/sparc/lib/strncpy_from_user_32.S
@@ -0,0 +1,47 @@
+/* strncpy_from_user.S: Sparc strncpy from userspace.
+ *
+ *  Copyright(C) 1996 David S. Miller
+ */
+#include <asm/ptrace.h>
+#include <asm/errno.h>
+        .text
+        .align  4
+        /* Must return:
+         *
+         * -EFAULT              for an exception
+         * count                if we hit the buffer limit
+         * bytes copied         if we hit a null byte
+         */
+        .globl  __strncpy_from_user
+__strncpy_from_user:
+        /* %o0=dest, %o1=src, %o2=count */
+        mov     %o2, %o3                ! %o3 = original count, kept for the return value
+1:
+        subcc   %o2, 1, %o2             ! consume one byte of the budget
+        bneg    2f                      ! budget exhausted: stop copying
+         nop
+10:                                     ! user byte load, listed in __ex_table
+        ldub    [%o1], %o4
+        add     %o0, 1, %o0
+        cmp     %o4, 0
+        add     %o1, 1, %o1
+        bne     1b                      ! loop until a NUL byte was read
+         stb    %o4, [%o0 - 1]          ! delay slot: store the byte, terminating NUL included
+2:
+        add     %o2, 1, %o0             ! %o0 = unused budget + 1 (0 when the limit was hit)
+        retl
+         sub    %o3, %o0, %o0           ! return count - %o0: bytes copied without the NUL, or count
+        .section .fixup,#alloc,#execinstr
+        .align  4
+4:                                      ! fixup target named by the __ex_table entry below
+        retl
+         mov    -EFAULT, %o0
+        .section __ex_table,#alloc
+        .align  4
+        .word   10b, 4b                 ! pair: instruction that may fault, fixup to run
diff --git a/arch/sparc/lib/strncpy_from_user_64.S b/arch/sparc/lib/strncpy_from_user_64.S
new file mode 100644
index 00000000000..511c8f136f9
--- /dev/null
+++ b/arch/sparc/lib/strncpy_from_user_64.S
@@ -0,0 +1,135 @@
+/*
+ * strncpy_from_user.S: Sparc64 strncpy from userspace.
+ *
+ *  Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ */
+#include <asm/asi.h>
+#include <asm/errno.h>
+        .data
+        .align  8
+0:      .xword  0x0101010101010101      ! 0x01 in every byte; loaded at label 50 for the zero-byte test
+        .text
+        .align  32
+        /* Must return:
+         *
+         * -EFAULT              for an exception
+         * count                if we hit the buffer limit
+         * bytes copied         if we hit a null byte
+         * (without the null byte)
+         *
+         * This implementation assumes:
+         * %o1 is 8 aligned => !(%o2 & 7)
+         * %o0 is 8 aligned (if not, it will be slooooow, but will work)
+         *
+         * This is optimized for the common case:
+         * in my stats, 90% of src are 8 aligned (even on sparc32)
+         * and average length is 18 or so.
+         */
+        .globl  __strncpy_from_user
+        .type   __strncpy_from_user,#function
+__strncpy_from_user:
+        /* %o0=dest, %o1=src, %o2=count */
+        andcc   %o1, 7, %g0             ! IEU1  Group; is src 8-byte aligned?
+        bne,pn  %icc, 30f               ! CTI; no: byte-at-a-time path
+         add    %o0, %o2, %g3           ! IEU0; delay slot: %g3 = dest + count (end of buffer)
+60:     ldxa    [%o1] %asi, %g1         ! Load  Group; first 8 source bytes
+        brlez,pn %o2, 10f               ! CTI; count <= 0: return count
+         mov    %o0, %o3                ! IEU0; delay slot: %o3 = start of dest
+50:     sethi   %hi(0b), %o4            ! IEU0  Group
+        ldx     [%o4 + %lo(0b)], %o4    ! Load; %o4 = 0x0101010101010101
+        sllx    %o4, 7, %o5             ! IEU1  Group; %o5 = 0x8080808080808080
+1:      sub     %g1, %o4, %g2           ! IEU0  Group; %g2 & %o5 != 0 means a byte may be zero
+        stx     %g1, [%o0]              ! Store; the whole word is stored before it is examined
+        add     %o0, 8, %o0             ! IEU1
+        andcc   %g2, %o5, %g0           ! IEU1  Group
+        bne,pn  %xcc, 5f                ! CTI; candidate word: locate the zero byte
+         add    %o1, 8, %o1             ! IEU0
+        cmp     %o0, %g3                ! IEU1  Group; room left in dest?
+        bl,a,pt %xcc, 1b                ! CTI
+61:      ldxa   [%o1] %asi, %g1         ! Load; annulled delay slot: only runs when looping
+10:     retl                            ! CTI   Group
+         mov    %o2, %o0                ! IEU0; return count
+5:      srlx    %g2, 32, %g7            ! IEU0  Group; test result for the upper four bytes
+        sethi   %hi(0xff00), %o4        ! IEU1; %o4 is reused as a byte mask from here on
+        andcc   %g7, %o5, %g0           ! IEU1  Group
+        be,pn   %icc, 2f                ! CTI; nothing flagged in the upper half
+         or     %o4, %lo(0xff00), %o4   ! IEU0; %o4 = 0xff00
+        srlx    %g1, 48, %g7            ! IEU0  Group; bytes 0 and 1 of the word
+        andcc   %g7, %o4, %g0           ! IEU1  Group
+        be,pn   %icc, 50f               ! CTI; byte 0 is NUL
+         andcc  %g7, 0xff, %g0          ! IEU1  Group
+        be,pn   %icc, 51f               ! CTI; byte 1 is NUL
+         srlx   %g1, 32, %g7            ! IEU0; bytes 2 and 3 now in the low 16 bits
+        andcc   %g7, %o4, %g0           ! IEU1  Group
+        be,pn   %icc, 52f               ! CTI; byte 2 is NUL
+         andcc  %g7, 0xff, %g0          ! IEU1  Group
+        be,pn   %icc, 53f               ! CTI; byte 3 is NUL
+2:       andcc  %g2, %o5, %g0           ! IEU1  Group; %icc now reflects the lower four bytes
+        be,pn   %icc, 2f                ! CTI; nothing flagged in the lower half either
+         srl    %g1, 16, %g7            ! IEU0; bytes 4 and 5
+        andcc   %g7, %o4, %g0           ! IEU1  Group
+        be,pn   %icc, 54f               ! CTI; byte 4 is NUL
+         andcc  %g7, 0xff, %g0          ! IEU1  Group
+        be,pn   %icc, 55f               ! CTI; byte 5 is NUL
+         andcc  %g1, %o4, %g0           ! IEU1  Group
+        be,pn   %icc, 56f               ! CTI; byte 6 is NUL
+         andcc  %g1, 0xff, %g0          ! IEU1  Group
+        be,a,pn %icc, 57f               ! CTI; byte 7 is NUL
+         sub    %o0, %o3, %o0           ! IEU0; annulled delay slot: only runs when byte 7 is NUL
+2:      cmp     %o0, %g3                ! IEU1  Group; no zero byte after all: room left in dest?
+        bl,a,pt %xcc, 50b               ! CTI; back to 50, which reloads %o4 (now holding 0xff00)
+62:      ldxa   [%o1] %asi, %g1         ! Load; annulled delay slot: only runs when looping
+        retl                            ! CTI   Group
+         mov    %o2, %o0                ! IEU0; return count
+50:     sub     %o0, %o3, %o0           ! %o0 was already advanced 8 past the word holding the NUL
+        retl
+         sub    %o0, 8, %o0             ! NUL at byte 0
+51:     sub     %o0, %o3, %o0
+        retl
+         sub    %o0, 7, %o0             ! NUL at byte 1
+52:     sub     %o0, %o3, %o0
+        retl
+         sub    %o0, 6, %o0             ! NUL at byte 2
+53:     sub     %o0, %o3, %o0
+        retl
+         sub    %o0, 5, %o0             ! NUL at byte 3
+54:     sub     %o0, %o3, %o0
+        retl
+         sub    %o0, 4, %o0             ! NUL at byte 4
+55:     sub     %o0, %o3, %o0
+        retl
+         sub    %o0, 3, %o0             ! NUL at byte 5
+56:     sub     %o0, %o3, %o0
+        retl
+         sub    %o0, 2, %o0             ! NUL at byte 6
+57:     retl
+         sub    %o0, 1, %o0             ! NUL at byte 7
+30:     brlez,pn %o2, 3f                ! unaligned src: count <= 0 returns count
+         sub    %g0, %o2, %o3           ! delay slot: %o3 = -count, counts up towards zero
+        add     %o0, %o2, %o0           ! %o0 = dest + count, so [%o0 + %o3] walks dest
+63:     lduba   [%o1] %asi, %o4         ! first source byte
+1:      add     %o1, 1, %o1
+        brz,pn  %o4, 2f                 ! NUL read: finish
+         stb    %o4, [%o0 + %o3]        ! delay slot: store the byte, terminating NUL included
+        addcc   %o3, 1, %o3
+        bne,pt  %xcc, 1b                ! loop until the count is used up
+64:      lduba  [%o1] %asi, %o4         ! NOTE(review): delay slot is not annulled, so this load also runs on loop exit (one byte past the limit) - confirm intended
+3:      retl
+         mov    %o2, %o0                ! limit hit: return count
+2:      retl
+         add    %o2, %o3, %o0           ! count + (negative remainder) = bytes copied without the NUL
+        .size   __strncpy_from_user, .-__strncpy_from_user
+        .section __ex_table,"a"
+        .align  4
+        .word   60b, __retl_efault      ! pairs: user load that may fault, handler defined outside this file
+        .word   61b, __retl_efault
+        .word   62b, __retl_efault
+        .word   63b, __retl_efault
+        .word   64b, __retl_efault
+        .previous
diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S
new file mode 100644
index 00000000000..2101405bdfc
--- /dev/null
+++ b/arch/sparc/lib/udiv.S
@@ -0,0 +1,357 @@
+/*
+ * udiv.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .udiv       name of function to generate
+ *  div         div=div => %o0 / %o1; div=rem => %o0 % %o1
+ *  false               false=true => signed; false=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N           how many bits per iteration we try to get (4)
+ *  WORDSIZE    total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS     number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q           the partial quotient under development (initially 0)
+ *  R           the remainder so far, initially the dividend
+ *  ITER        number of main division loop iterations required;
+ *              equal to ceil(log2(quotient) / N).  Note that this
+ *              is the log base (2^N) of the quotient.
+ *  V           the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *      ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+        .globl .udiv
+        .globl _Udiv
+.udiv:                                  ! in: %o0 = dividend, %o1 = divisor; out: %o0 = quotient
+_Udiv:  /* needed for export */
+        ! Ready to divide.  Compute size of quotient; scale comparand.
+        orcc    %o1, %g0, %o5           ! %o5 = V = divisor; Z set when the divisor is zero
+        bne     1f
+         mov    %o0, %o3                ! delay slot: %o3 = R = dividend
+                ! Divide by zero trap.  If it returns, return 0 (about as
+                ! wrong as possible, but that is what SunOS does...).
+                ta      ST_DIV0
+                retl
+                 clr    %o0
+1:
+        cmp     %o3, %o5                        ! if %o1 exceeds %o0, done
+        blu     Lgot_result             ! (and algorithm fails otherwise)
+         clr    %o2                     ! delay slot: %o2 = Q = 0
+        sethi   %hi(1 << (32 - 4 - 1)), %g1     ! %g1 = 1 << 27, threshold for the careful path
+        cmp     %o3, %g1
+        blu     Lnot_really_big
+         clr    %o4                     ! delay slot: %o4 = ITER = 0
+        ! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+        ! as our usual N-at-a-shot divide step will cause overflow and havoc.
+        ! The number of bits in the result here is N*ITER+SC, where SC <= N.
+        ! Compute ITER in an unorthodox manner: know we need to shift V into
+        ! the top decade: so do not even bother to compare to R.
+        1:
+                cmp     %o5, %g1
+                bgeu    3f
+                 mov    1, %g7          ! delay slot: %g7 = count of single-bit steps
+                sll     %o5, 4, %o5     ! V <<= 4
+                b       1b
+                 add    %o4, 1, %o4     ! delay slot: ITER += 1
+        ! Now compute %g7.
+        2:
+                addcc   %o5, %o5, %o5   ! V <<= 1, carry set when a bit is shifted out
+                bcc     Lnot_too_big
+                 add    %g7, 1, %g7
+                ! We get here if the %o1 overflowed while shifting.
+                ! This means that %o3 has the high-order bit set.
+                ! Restore %o5 and subtract from %o3.
+                sll     %g1, 4, %g1     ! high order bit
+                srl     %o5, 1, %o5             ! rest of %o5
+                add     %o5, %g1, %o5
+                b       Ldo_single_div
+                 sub    %g7, 1, %g7
+        Lnot_too_big:
+        3:
+                cmp     %o5, %o3
+                blu     2b              ! V < R: keep doubling V
+                 nop
+                be      Ldo_single_div
+                 nop
+        /* NB: these are commented out in the V8-Sparc manual as well */
+        /* (I do not understand this) */
+        ! %o5 > %o3: went too far: back up 1 step
+        !       srl     %o5, 1, %o5
+        !       dec     %g7
+        ! do single-bit divide steps
+        !
+        ! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+        ! first divide step without thinking.  BUT, the others are conditional,
+        ! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+        ! order bit set in the first step, just falling into the regular
+        ! division loop will mess up the first time around.
+        ! So we unroll slightly...
+        Ldo_single_div:
+                subcc   %g7, 1, %g7
+                bl      Lend_regular_divide
+                 nop
+                sub     %o3, %o5, %o3   ! first step done unconditionally: R -= V
+                mov     1, %o2          ! Q = 1
+                b       Lend_single_divloop
+                 nop
+        Lsingle_divloop:
+                sll     %o2, 1, %o2     ! Q <<= 1 (sll leaves the flags from tst %o3 untouched)
+                bl      1f
+                 srl    %o5, 1, %o5     ! delay slot: V >>= 1
+                ! %o3 >= 0
+                sub     %o3, %o5, %o3
+                b       2f
+                 add    %o2, 1, %o2
+        1:      ! %o3 < 0
+                add     %o3, %o5, %o3
+                sub     %o2, 1, %o2
+        2:
+        Lend_single_divloop:
+                subcc   %g7, 1, %g7
+                bge     Lsingle_divloop
+                 tst    %o3             ! delay slot: flags for the bl at the top of the loop
+                b,a     Lend_regular_divide
+Lnot_really_big:
+1:
+        sll     %o5, 4, %o5             ! V <<= 4 until it exceeds R
+        cmp     %o5, %o3
+        bleu    1b
+         addcc  %o4, 1, %o4             ! delay slot: ITER += 1, flags feed the be below
+        be      Lgot_result             ! taken when the addcc above produced zero
+         sub    %o4, 1, %o4
+        tst     %o3     ! set up for initial iteration
+Ldivloop:
+        sll     %o2, 4, %o2             ! Q <<= 4; each pass through the tree adds four quotient bits
+                ! depth 1, accumulated bits 0
+        bl      L.1.16
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits 1
+        bl      L.2.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 3
+        bl      L.3.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 7
+        bl      L.4.23
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2+1), %o2
+L.4.23:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2-1), %o2
+L.3.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 5
+        bl      L.4.21
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2+1), %o2
+L.4.21:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2-1), %o2
+L.2.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 1
+        bl      L.3.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 3
+        bl      L.4.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2+1), %o2
+L.4.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2-1), %o2
+L.3.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 1
+        bl      L.4.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2+1), %o2
+L.4.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2-1), %o2
+L.1.16:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits -1
+        bl      L.2.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -1
+        bl      L.3.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -1
+        bl      L.4.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2+1), %o2
+L.4.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2-1), %o2
+L.3.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -3
+        bl      L.4.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2+1), %o2
+L.4.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2-1), %o2
+L.2.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -3
+        bl      L.3.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -5
+        bl      L.4.11
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2+1), %o2
+L.4.11:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2-1), %o2
+L.3.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -7
+        bl      L.4.9
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2+1), %o2
+L.4.9:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2-1), %o2
+        9:
+Lend_regular_divide:
+        subcc   %o4, 1, %o4             ! ITER -= 1
+        bge     Ldivloop
+         tst    %o3                     ! delay slot: flags for the first bl in Ldivloop and the bl,a below
+        bl,a    Lgot_result             ! negative remainder: run the annulled delay slot, then return
+        ! non-restoring fixup here (one instruction only!)
+        sub     %o2, 1, %o2
+Lgot_result:
+        retl
+         mov %o2, %o0                   ! delay slot: return Q
+        .globl  .udiv_patch             ! unsigned 32-bit divide using the udiv instruction
+.udiv_patch:                            ! in: %o0 = dividend, %o1 = divisor; out: %o0 = quotient
+        wr      %g0, 0x0, %y            ! %y = 0: upper 32 bits of the dividend are zero
+        nop                             ! two nops plus retl separate the %y write from the divide
+        nop
+        retl
+         udiv   %o0, %o1, %o0           ! delay slot: %o0 = (%y:%o0) / %o1
+        nop                             ! NOTE(review): trailing pad, presumably fixes the stub length - confirm against the patching code
diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S
new file mode 100644
index 00000000000..1f36ae68252
--- /dev/null
+++ b/arch/sparc/lib/umul.S
@@ -0,0 +1,171 @@
+/*
+ * umul.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+/*
+ * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
+ * upper 32 bits of the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies.  Short
+ * multiplies require 25 instruction cycles, and long ones require
+ * 45 instruction cycles.
+ *
+ * On return, overflow has occurred (%o1 is not zero) if and only if
+ * the Z condition code is clear, allowing, e.g., the following:
+ *
+ *      call    .umul
+ *      nop
+ *      bnz     overflow        (or tnz)
+ */
+        .globl .umul
+        .globl _Umul
+.umul:                                  ! in: %o0, %o1 = unsigned operands; out: %o1:%o0 = 64-bit product
+_Umul:  /* needed for export */
+        or      %o0, %o1, %o4           ! union of both operands, used by the short-multiply test
+        mov     %o0, %y         ! multiplier -> Y
+        andncc  %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
+        be      Lmul_shortway   ! if zero, can do it the short way
+         andcc  %g0, %g0, %o4   ! zero the partial product and clear N and V
+        /*
+         * Long multiply.  32 steps, followed by a final shift step.
+         */
+        mulscc  %o4, %o1, %o4   ! 1
+        mulscc  %o4, %o1, %o4   ! 2
+        mulscc  %o4, %o1, %o4   ! 3
+        mulscc  %o4, %o1, %o4   ! 4
+        mulscc  %o4, %o1, %o4   ! 5
+        mulscc  %o4, %o1, %o4   ! 6
+        mulscc  %o4, %o1, %o4   ! 7
+        mulscc  %o4, %o1, %o4   ! 8
+        mulscc  %o4, %o1, %o4   ! 9
+        mulscc  %o4, %o1, %o4   ! 10
+        mulscc  %o4, %o1, %o4   ! 11
+        mulscc  %o4, %o1, %o4   ! 12
+        mulscc  %o4, %o1, %o4   ! 13
+        mulscc  %o4, %o1, %o4   ! 14
+        mulscc  %o4, %o1, %o4   ! 15
+        mulscc  %o4, %o1, %o4   ! 16
+        mulscc  %o4, %o1, %o4   ! 17
+        mulscc  %o4, %o1, %o4   ! 18
+        mulscc  %o4, %o1, %o4   ! 19
+        mulscc  %o4, %o1, %o4   ! 20
+        mulscc  %o4, %o1, %o4   ! 21
+        mulscc  %o4, %o1, %o4   ! 22
+        mulscc  %o4, %o1, %o4   ! 23
+        mulscc  %o4, %o1, %o4   ! 24
+        mulscc  %o4, %o1, %o4   ! 25
+        mulscc  %o4, %o1, %o4   ! 26
+        mulscc  %o4, %o1, %o4   ! 27
+        mulscc  %o4, %o1, %o4   ! 28
+        mulscc  %o4, %o1, %o4   ! 29
+        mulscc  %o4, %o1, %o4   ! 30
+        mulscc  %o4, %o1, %o4   ! 31
+        mulscc  %o4, %o1, %o4   ! 32
+        mulscc  %o4, %g0, %o4   ! final shift
+        /*
+         * Normally, with the shift-and-add approach, if both numbers are
+         * positive you get the correct result.  With 32-bit two's-complement
+         * numbers, -x is represented as
+         *
+         *                x                 32
+         *      ( 2  -  ------ ) mod 2  *  2
+         *                 32
+         *                2
+         *
+         * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
+         * we can treat this as if the radix point were just to the left
+         * of the sign bit (multiply by 2^32), and get
+         *
+         *      -x  =  (2 - x) mod 2
+         *
+         * Then, ignoring the `mod 2's for convenience:
+         *
+         *   x *  y     = xy
+         *  -x *  y     = 2y - xy
+         *   x * -y     = 2x - xy
+         *  -x * -y     = 4 - 2x - 2y + xy
+         *
+         * For signed multiplies, we subtract (x << 32) from the partial
+         * product to fix this problem for negative multipliers (see mul.s).
+         * Because of the way the shift into the partial product is calculated
+         * (N xor V), this term is automatically removed for the multiplicand,
+         * so we don't have to adjust.
+         *
+         * But for unsigned multiplies, the high order bit wasn't a sign bit,
+         * and the correction is wrong.  So for unsigned multiplies where the
+         * high order bit is one, we end up with xy - (y << 32).  To fix it
+         * we add y << 32.
+         */
+#if 0
+        tst     %o1
+        bl,a    1f              ! if %o1 < 0 (high order bit = 1),
+         add    %o4, %o0, %o4   ! %o4 += %o0 (add y to upper half)
+1:
+        rd      %y, %o0         ! get lower half of product
+        retl
+         addcc  %o4, %g0, %o1   ! put upper half in place and set Z for %o1==0
+#else
+        /* Faster code from tege@sics.se.  */
+        sra     %o1, 31, %o2    ! make mask from sign bit
+        and     %o0, %o2, %o2   ! %o2 = 0 or %o0, depending on sign of %o1
+        rd      %y, %o0         ! get lower half of product
+        retl
+         addcc  %o4, %o2, %o1   ! add compensation and put upper half in place
+#endif
+Lmul_shortway:                          ! reached when neither operand has any of bits 12..31 set
+        /*
+         * Short multiply.  12 steps, followed by a final shift step.
+         * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+         * but there is no problem with %o0 being negative (unlike above),
+         * and overflow is impossible (the answer is at most 24 bits long).
+         */
+        mulscc  %o4, %o1, %o4   ! 1
+        mulscc  %o4, %o1, %o4   ! 2
+        mulscc  %o4, %o1, %o4   ! 3
+        mulscc  %o4, %o1, %o4   ! 4
+        mulscc  %o4, %o1, %o4   ! 5
+        mulscc  %o4, %o1, %o4   ! 6
+        mulscc  %o4, %o1, %o4   ! 7
+        mulscc  %o4, %o1, %o4   ! 8
+        mulscc  %o4, %o1, %o4   ! 9
+        mulscc  %o4, %o1, %o4   ! 10
+        mulscc  %o4, %o1, %o4   ! 11
+        mulscc  %o4, %o1, %o4   ! 12
+        mulscc  %o4, %g0, %o4   ! final shift
+        /*
+         * %o4 has 20 of the bits that should be in the result; %y has
+         * the bottom 12 (as %y's top 12).  That is:
+         *
+         *        %o4               %y
+         * +----------------+----------------+
+         * | -12- |   -20-  | -12- |   -20-  |
+         * +------(---------+------)---------+
+         *         -----result-----
+         *
+         * The 12 bits of %o4 left of the `result' area are all zero;
+         * in fact, all top 20 bits of %o4 are zero.
+         */
+        rd      %y, %o5
+        sll     %o4, 12, %o0    ! shift middle bits left 12
+        srl     %o5, 20, %o5    ! shift low bits right 20
+        or      %o5, %o0, %o0           ! %o0 = the two pieces merged into the 32-bit result
+        retl
+         addcc  %g0, %g0, %o1   ! %o1 = zero, and set Z
+        .globl  .umul_patch             ! unsigned multiply using the umul instruction
+.umul_patch:                            ! in: %o0, %o1 = operands; out: %o0 = low word, %o1 = value read from %y
+        umul    %o0, %o1, %o0           ! %o0 = low 32 bits of the product
+        retl
+         rd     %y, %o1                 ! delay slot: %o1 = %y (upper half of the product per the .umul contract)
+        nop                             ! NOTE(review): trailing pad, presumably fixes the stub length - confirm against the patching code
diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S
new file mode 100644
index 00000000000..77123eb83c4
--- /dev/null
+++ b/arch/sparc/lib/urem.S
@@ -0,0 +1,357 @@
+/*
+ * urem.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters (names were macro-expanded; the values used here are shown):
+ *  .urem       name of function to generate
+ *  rem         operation: div => %o0 / %o1; rem => %o0 % %o1
+ *  false       signedness: true => signed; false => unsigned
+ *
+ * Algorithm parameters:
+ *  N           how many bits per iteration we try to get (4)
+ *  WORDSIZE    total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS     number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q           the partial quotient under development (initially 0)
+ *  R           the remainder so far, initially the dividend
+ *  ITER        number of main division loop iterations required;
+ *              equal to ceil(log2(quotient) / N).  Note that this
+ *              is the log base (2^N) of the quotient.
+ *  V           the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *      ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+        .globl .urem
+        .globl _Urem
+.urem:
+_Urem:  /* needed for export */
+        ! Ready to divide.  Compute size of quotient; scale comparand.
+        orcc    %o1, %g0, %o5   /* %o5 = divisor (comparand V); Z set if it is zero */
+        bne     1f              /* non-zero divisor: skip the trap */
+         mov    %o0, %o3        /* delay slot: %o3 = dividend (remainder R) */
+                ! Divide by zero trap.  If it returns, return 0 (about as
+                ! wrong as possible, but that is what SunOS does...).
+                ta      ST_DIV0
+                retl
+                 clr    %o0     /* delay slot: return value 0 */
+1:
+        cmp     %o3, %o5                        ! if %o1 exceeds %o0, done
+        blu     Lgot_result             ! (and algorithm fails otherwise)
+         clr    %o2             /* delay slot: %o2 = 0 (partial quotient Q) */
+        sethi   %hi(1 << (32 - 4 - 1)), %g1     /* %g1 = 1 << 27, i.e. 2^(32-N-1) with N = 4 */
+        cmp     %o3, %g1
+        blu     Lnot_really_big /* dividend below 2^27: take the N-bits-per-step path */
+         clr    %o4             /* delay slot: %o4 = 0 (iteration count ITER) */
+        ! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+        ! as our usual N-at-a-shot divide step will cause overflow and havoc.
+        ! The number of bits in the result here is N*ITER+SC, where SC <= N.
+        ! Compute ITER in an unorthodox manner: know we need to shift V into
+        ! the top decade: so do not even bother to compare to R.
+        1:
+                cmp     %o5, %g1
+                bgeu    3f              /* V >= 2^27: stop shifting by N */
+                 mov    1, %g7          /* delay slot: %g7 = 1 (single-bit step counter) */
+                sll     %o5, 4, %o5     /* V <<= N */
+                b       1b
+                 add    %o4, 1, %o4     /* delay slot: ITER += 1 */
+        ! Now compute %g7.
+        2:
+                addcc   %o5, %o5, %o5   /* V <<= 1; carry set if a bit was shifted out */
+                bcc     Lnot_too_big
+                 add    %g7, 1, %g7     /* delay slot: %g7 += 1 (executed on both paths) */
+                ! We get here if the %o1 overflowed while shifting.
+                ! This means that %o3 has the high-order bit set.
+                ! Restore %o5 and subtract from %o3.
+                sll     %g1, 4, %g1     ! high order bit
+                srl     %o5, 1, %o5             ! rest of %o5
+                add     %o5, %g1, %o5
+                b       Ldo_single_div
+                 sub    %g7, 1, %g7     /* delay slot: undo the %g7 increment done above */
+        Lnot_too_big:
+        3:
+                cmp     %o5, %o3
+                blu     2b              /* V < R (unsigned): double V again */
+                 nop
+                be      Ldo_single_div  /* V == R */
+                 nop
+        /* NB: these are commented out in the V8-Sparc manual as well */
+        /* (I do not understand this) */
+        ! %o5 > %o3: went too far: back up 1 step
+        !       srl     %o5, 1, %o5
+        !       dec     %g7
+        ! do single-bit divide steps
+        !
+        ! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+        ! first divide step without thinking.  BUT, the others are conditional,
+        ! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+        ! order bit set in the first step, just falling into the regular
+        ! division loop will mess up the first time around.
+        ! So we unroll slightly...
+        Ldo_single_div:
+                subcc   %g7, 1, %g7
+                bl      Lend_regular_divide     /* counter went negative: no single-bit steps */
+                 nop
+                sub     %o3, %o5, %o3   /* first step is unconditional: R -= V */
+                mov     1, %o2          /* Q = 1 */
+                b       Lend_single_divloop
+                 nop
+        Lsingle_divloop:
+                sll     %o2, 1, %o2     /* Q <<= 1 (sll leaves the condition codes alone) */
+                bl      1f              /* cc come from the "tst %o3" in the loop branch delay slot */
+                 srl    %o5, 1, %o5     /* delay slot: V >>= 1 */
+                ! %o3 >= 0
+                sub     %o3, %o5, %o3   /* R -= V */
+                b       2f
+                 add    %o2, 1, %o2     /* delay slot: Q += 1 */
+        1:      ! %o3 < 0
+                add     %o3, %o5, %o3   /* R += V */
+                sub     %o2, 1, %o2     /* Q -= 1 */
+        2:
+        Lend_single_divloop:
+                subcc   %g7, 1, %g7
+                bge     Lsingle_divloop
+                 tst    %o3             /* delay slot: set cc on R for the next step */
+                b,a     Lend_regular_divide     /* annulled branch: no delay-slot instruction runs */
+Lnot_really_big:
+1:
+        sll     %o5, 4, %o5     /* V <<= N */
+        cmp     %o5, %o3
+        bleu    1b              /* keep scaling while V <= R (unsigned) */
+         addcc  %o4, 1, %o4     /* delay slot: ITER += 1; also overwrites the cc from cmp */
+        be      Lgot_result     /* tests cc of the addcc above: taken only if ITER became 0 */
+         sub    %o4, 1, %o4     /* delay slot: ITER -= 1 */
+        tst     %o3     ! set up for initial iteration
+Ldivloop:
+        sll     %o2, 4, %o2     /* Q <<= N; each step below does V >>= 1, then R -= V or R += V */
+                ! depth 1, accumulated bits 0
+        bl      L.1.16          /* taken when the cc set on %o3 (R) show it negative */
+         srl    %o5,1,%o5       /* delay slot: V >>= 1 */
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits 1
+        bl      L.2.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 3
+        bl      L.3.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 7
+        bl      L.4.23
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2+1), %o2       /* delay slot: Q += 15 */
+L.4.23:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (7*2-1), %o2       /* delay slot: Q += 13 */
+L.3.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 5
+        bl      L.4.21
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2+1), %o2       /* delay slot: Q += 11 */
+L.4.21:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (5*2-1), %o2       /* delay slot: Q += 9 */
+L.2.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits 1
+        bl      L.3.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 3
+        bl      L.4.19
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2+1), %o2       /* delay slot: Q += 7 */
+L.4.19:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (3*2-1), %o2       /* delay slot: Q += 5 */
+L.3.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits 1
+        bl      L.4.17
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2+1), %o2       /* delay slot: Q += 3 */
+        
+L.4.17:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (1*2-1), %o2       /* delay slot: Q += 1 */
+L.1.16:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 2, accumulated bits -1
+        bl      L.2.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -1
+        bl      L.3.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -1
+        bl      L.4.15
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2+1), %o2      /* delay slot: Q -= 1 */
+L.4.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-1*2-1), %o2      /* delay slot: Q -= 3 */
+L.3.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -3
+        bl      L.4.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2+1), %o2      /* delay slot: Q -= 5 */
+L.4.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-3*2-1), %o2      /* delay slot: Q -= 7 */
+L.2.15:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 3, accumulated bits -3
+        bl      L.3.13
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -5
+        bl      L.4.11
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2+1), %o2      /* delay slot: Q -= 9 */
+        
+L.4.11:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-5*2-1), %o2      /* delay slot: Q -= 11 */
+L.3.13:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+                        ! depth 4, accumulated bits -7
+        bl      L.4.9
+         srl    %o5,1,%o5
+        ! remainder is positive
+        subcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2+1), %o2      /* delay slot: Q -= 13 */
+L.4.9:
+        ! remainder is negative
+        addcc   %o3,%o5,%o3
+        b       9f
+         add    %o2, (-7*2-1), %o2      /* delay slot: Q -= 15 */
+        9:
+Lend_regular_divide:
+        subcc   %o4, 1, %o4     /* ITER -= 1 */
+        bge     Ldivloop        /* another N-bit step while ITER >= 0 */
+         tst    %o3             /* delay slot: set cc on R for the next step / final fixup */
+        bl,a    Lgot_result     /* annul bit: the add below runs only when R < 0 */
+        ! non-restoring fixup here (one instruction only!)
+        add     %o3, %o1, %o3   /* R += divisor */
+Lgot_result:
+        retl
+         mov %o3, %o0           /* delay slot: return the remainder R in %o0 */
+        .globl  .urem_patch
+.urem_patch:    /* %o0 = %o0 % %o1 (unsigned), using the hardware udiv/umul */
+        wr      %g0, 0x0, %y    /* %y = 0: udiv divides the 64-bit value %y:%o0 */
+        nop                     /* three nops: let the %y write take effect before udiv reads it */
+        nop
+        nop
+        udiv    %o0, %o1, %o2   /* %o2 = %o0 / %o1 */
+        umul    %o2, %o1, %o2   /* %o2 = quotient * %o1 (low 32 bits) */
+        retl
+         sub    %o0, %o2, %o0   /* delay slot: %o0 = %o0 - %o2, the remainder */
author	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-22 10:38:37 -0500
committer	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-22 10:38:37 -0500
commit	fcc9d2e5a6c89d22b8b773a64fb4ad21ac318446 (patch)
tree	a57612d1888735a2ec7972891b68c1ac5ec8faea /arch/sparc/lib
parent	8dea78da5cee153b8af9c07a2745f6c55057fe12 (diff)