From fcc9d2e5a6c89d22b8b773a64fb4ad21ac318446 Mon Sep 17 00:00:00 2001
From: Jonathan Herman <hermanjl@cs.unc.edu>
Date: Tue, 22 Jan 2013 10:38:37 -0500
Subject: Added missing tegra files.

---
 arch/sparc/lib/atomic_32.S            |  99 +++++++++
 arch/sparc/lib/mul.S                  | 137 ++++++++++++
 arch/sparc/lib/rem.S                  | 384 ++++++++++++++++++++++++++++++++++
 arch/sparc/lib/sdiv.S                 | 381 +++++++++++++++++++++++++++++++++
 arch/sparc/lib/strlen_user_32.S       | 109 ++++++++++
 arch/sparc/lib/strlen_user_64.S       |  95 +++++++++
 arch/sparc/lib/strncpy_from_user_32.S |  47 +++++
 arch/sparc/lib/strncpy_from_user_64.S | 135 ++++++++++++
 arch/sparc/lib/udiv.S                 | 357 +++++++++++++++++++++++++++++++
 arch/sparc/lib/umul.S                 | 171 +++++++++++++++
 arch/sparc/lib/urem.S                 | 357 +++++++++++++++++++++++++++++++
 11 files changed, 2272 insertions(+)
 create mode 100644 arch/sparc/lib/atomic_32.S
 create mode 100644 arch/sparc/lib/mul.S
 create mode 100644 arch/sparc/lib/rem.S
 create mode 100644 arch/sparc/lib/sdiv.S
 create mode 100644 arch/sparc/lib/strlen_user_32.S
 create mode 100644 arch/sparc/lib/strlen_user_64.S
 create mode 100644 arch/sparc/lib/strncpy_from_user_32.S
 create mode 100644 arch/sparc/lib/strncpy_from_user_64.S
 create mode 100644 arch/sparc/lib/udiv.S
 create mode 100644 arch/sparc/lib/umul.S
 create mode 100644 arch/sparc/lib/urem.S

(limited to 'arch/sparc/lib')

diff --git a/arch/sparc/lib/atomic_32.S b/arch/sparc/lib/atomic_32.S
new file mode 100644
index 00000000000..178cbb8ae1b
--- /dev/null
+++ b/arch/sparc/lib/atomic_32.S
@@ -0,0 +1,99 @@
+/* atomic.S: Move this stuff here for better ICACHE hit rates.
+ *
+ * Copyright (C) 1996 David S. Miller (davem@caipfs.rutgers.edu)
+ */
+
+#include <asm/ptrace.h>
+#include <asm/psr.h>
+
+	.text
+	.align	4
+
+	.globl  __atomic_begin
+__atomic_begin:
+
+#ifndef CONFIG_SMP
+	.globl	___xchg32_sun4c		! in: %g1 = address, %g2 = new value; out: %g2 = old value; uses %g3, %g7
+___xchg32_sun4c:
+	rd	%psr, %g3		! %g3 = PSR on entry
+	andcc	%g3, PSR_PIL, %g0	! any PIL bit already set?
+	bne	1f			! yes: leave the PSR alone
+	 nop
+	wr	%g3, PSR_PIL, %psr	! wr XORs its operands: the PIL bits were all clear, so this sets them
+	nop; nop; nop			! let the PSR write settle
+1:
+	andcc	%g3, PSR_PIL, %g0	! same test again; the bne below consumes these flags
+	ld	[%g1], %g7		! %g7 = old contents of the word
+	bne	1f			! PSR was not modified above: nothing to restore
+	 st	%g2, [%g1]		! delay slot (runs on both paths): store the new value
+	wr	%g3, 0x0, %psr		! write the entry PSR back
+	nop; nop; nop			! let the PSR write settle
+1:
+	mov	%g7, %g2		! return the old value in %g2
+	jmpl	%o7 + 8, %g0		! return
+	 mov	%g4, %o7		! delay slot: %o7 = %g4 (NOTE(review): presumably the caller parked its %o7 in %g4 - confirm)
+
+	.globl	___xchg32_sun4md	! in: %g1 = address, %g2 = new value; out: %g2 = old value
+___xchg32_sun4md:
+	swap	[%g1], %g2		! exchange %g2 with the word at [%g1] in one instruction
+	jmpl	%o7 + 8, %g0		! return
+	 mov	%g4, %o7		! delay slot: %o7 = %g4 (NOTE(review): presumably the saved caller %o7 - confirm)
+#endif
+
+	/* Read asm-sparc/atomic.h carefully to understand how this works for SMP.
+	 * Really, some things here for SMP are overly clever, go read the header.
+	 */
+	.globl	___atomic24_add		! in: %g1 = counter address, %g2 = addend; out: %g2 = updated value; uses %g3, %g7
+___atomic24_add:
+	rd	%psr, %g3		! Keep the entry PSR for the restore at the end
+	nop; nop; nop;			! Let the bits settle
+	or	%g3, PSR_PIL, %g7	! Disable interrupts
+	wr	%g7, 0x0, %psr		! Set %psr
+	nop; nop; nop;			! Let the bits settle
+#ifdef CONFIG_SMP
+1:	ldstub	[%g1 + 3], %g7		! Spin on the byte lock for SMP.
+	orcc	%g7, 0x0, %g0		! Did we get it?
+	bne	1b			! Nope...
+	 ld	[%g1], %g7		! Load locked atomic24_t (delay slot: also runs on every retry)
+	sra	%g7, 8, %g7		! Get signed 24-bit integer
+	add	%g7, %g2, %g2		! Add in argument
+	sll	%g2, 8, %g7		! Transpose back to atomic24_t (low byte becomes 0)
+	st	%g7, [%g1]		! Clever: This releases the lock as well (byte at [%g1 + 3] is now 0).
+#else
+	ld	[%g1], %g7		! Load the counter word (this path has no lock byte and no shift)
+	add	%g7, %g2, %g2		! Add in argument
+	st	%g2, [%g1]		! Store it back
+#endif
+	wr	%g3, 0x0, %psr		! Restore original PSR_PIL
+	nop; nop; nop;			! Let the bits settle
+	jmpl	%o7, %g0		! NOTE: not + 8, see callers in atomic.h
+	 mov	%g4, %o7		! Restore %o7 (delay slot; value comes from %g4)
+
+	.globl	___atomic24_sub		! in: %g1 = counter address, %g2 = subtrahend; out: %g2 = updated value; uses %g3, %g7
+___atomic24_sub:
+	rd	%psr, %g3		! Keep the entry PSR for the restore at the end
+	nop; nop; nop;			! Let the bits settle
+	or	%g3, PSR_PIL, %g7	! Disable interrupts
+	wr	%g7, 0x0, %psr		! Set %psr
+	nop; nop; nop;			! Let the bits settle
+#ifdef CONFIG_SMP
+1:	ldstub	[%g1 + 3], %g7		! Spin on the byte lock for SMP.
+	orcc	%g7, 0x0, %g0		! Did we get it?
+	bne	1b			! Nope...
+	 ld	[%g1], %g7		! Load locked atomic24_t (delay slot: also runs on every retry)
+	sra	%g7, 8, %g7		! Get signed 24-bit integer
+	sub	%g7, %g2, %g2		! Subtract argument
+	sll	%g2, 8, %g7		! Transpose back to atomic24_t (low byte becomes 0)
+	st	%g7, [%g1]		! Clever: This releases the lock as well (byte at [%g1 + 3] is now 0)
+#else
+	ld	[%g1], %g7		! Load the counter word (this path has no lock byte and no shift)
+	sub	%g7, %g2, %g2		! Subtract argument
+	st	%g2, [%g1]		! Store it back
+#endif
+	wr	%g3, 0x0, %psr		! Restore original PSR_PIL
+	nop; nop; nop;			! Let the bits settle
+	jmpl	%o7, %g0		! NOTE: not + 8, see callers in atomic.h
+	 mov	%g4, %o7		! Restore %o7 (delay slot; value comes from %g4)
+
+	.globl  __atomic_end
+__atomic_end:
diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S
new file mode 100644
index 00000000000..c45470d0b0c
--- /dev/null
+++ b/arch/sparc/lib/mul.S
@@ -0,0 +1,137 @@
+/*
+ * mul.S:       This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+
+/*
+ * Signed multiply, from Appendix E of the Sparc Version 8
+ * Architecture Manual.
+ */
+
+/*
+ * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
+ * the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies.
+ */
+
+	.globl .mul		! signed multiply: in %o0, %o1; out %o1 = high word, %o0 = low word; uses %o2, %o4, %o5, %y
+	.globl _Mul
+.mul:
+_Mul:	/* needed for export */
+	mov	%o0, %y		! multiplier -> Y
+	andncc	%o0, 0xfff, %g0	! test bits 12..31
+	be	Lmul_shortway	! if zero, can do it the short way
+	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V
+
+	/*
+	 * Long multiply.  32 steps, followed by a final shift step.
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %o1, %o4	! 13
+	mulscc	%o4, %o1, %o4	! 14
+	mulscc	%o4, %o1, %o4	! 15
+	mulscc	%o4, %o1, %o4	! 16
+	mulscc	%o4, %o1, %o4	! 17
+	mulscc	%o4, %o1, %o4	! 18
+	mulscc	%o4, %o1, %o4	! 19
+	mulscc	%o4, %o1, %o4	! 20
+	mulscc	%o4, %o1, %o4	! 21
+	mulscc	%o4, %o1, %o4	! 22
+	mulscc	%o4, %o1, %o4	! 23
+	mulscc	%o4, %o1, %o4	! 24
+	mulscc	%o4, %o1, %o4	! 25
+	mulscc	%o4, %o1, %o4	! 26
+	mulscc	%o4, %o1, %o4	! 27
+	mulscc	%o4, %o1, %o4	! 28
+	mulscc	%o4, %o1, %o4	! 29
+	mulscc	%o4, %o1, %o4	! 30
+	mulscc	%o4, %o1, %o4	! 31
+	mulscc	%o4, %o1, %o4	! 32
+	mulscc	%o4, %g0, %o4	! final shift
+
+	! If %o0 was negative, the result is
+	!	(%o0 * %o1) + (%o1 << 32)
+	! We fix that here.
+
+#if 0
+	tst	%o0
+	bge	1f
+	 rd	%y, %o0
+
+	! %o0 was indeed negative; fix upper 32 bits of result by subtracting
+	! %o1 (i.e., return %o4 - %o1 in %o1).
+	retl
+	 sub	%o4, %o1, %o1
+
+1:
+	retl
+	 mov	%o4, %o1
+#else
+	/* Faster code adapted from tege@sics.se's code for umul.S.  */
+	sra	%o0, 31, %o2	! make mask from sign bit
+	and	%o1, %o2, %o2	! %o2 = 0 or %o1, depending on sign of %o0
+	rd	%y, %o0		! get lower half of product
+	retl
+	 sub	%o4, %o2, %o1	! delay slot: subtract compensation
+				!  and put upper half in place
+#endif
+
+Lmul_shortway:
+	/*
+	 * Short multiply.  12 steps, followed by a final shift step.
+	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+	 * but there is no problem with %o0 being negative (unlike above).
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 *  %o4 has 20 of the bits that should be in the low part of the
+	 * result; %y has the bottom 12 (as %y's top 12).  That is:
+	 *
+	 *	  %o4		    %y
+	 * +----------------+----------------+
+	 * | -12- |   -20-  | -12- |   -20-  |
+	 * +------(---------+------)---------+
+	 *  --hi-- ----low-part----
+	 *
+	 * The upper 12 bits of %o4 should be sign-extended to form the
+	 * high part of the product (i.e., highpart = %o4 >> 20).
+	 */
+
+	rd	%y, %o5		! %o5 = %y; its top 12 bits are the bottom 12 bits of the product
+	sll	%o4, 12, %o0	! shift middle bits left 12
+	srl	%o5, 20, %o5	! shift low bits right 20, zero fill at left
+	or	%o5, %o0, %o0	! construct low part of result
+	retl
+	 sra	%o4, 20, %o1	! delay slot: ... and extract high part of result
+
+	.globl	.mul_patch	! NOTE(review): presumably a replacement body for .mul on CPUs with hardware smul - confirm against the patching code
+.mul_patch:
+	smul	%o0, %o1, %o0	! %o0 = low 32 bits of the signed product; high 32 bits go to %y
+	retl
+	 rd	%y, %o1		! delay slot: %o1 = high 32 bits
+	nop
diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S
new file mode 100644
index 00000000000..42fb8625281
--- /dev/null
+++ b/arch/sparc/lib/rem.S
@@ -0,0 +1,384 @@
+/*
+ * rem.S:       This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+
+
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters (name = value used to generate this file):
+ *  NAME = .rem	name of function to generate
+ *  OP = rem	OP=div => %o0 / %o1; OP=rem => %o0 % %o1
+ *  S = true	S=true => signed; S=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+	.globl .rem			! signed remainder: in %o0 = dividend, %o1 = divisor; out %o0 = %o0 % %o1
+	.globl _Rem
+.rem:
+_Rem:	/* needed for export */
+	! compute sign of result; if neither is negative, no problem
+	orcc	%o1, %o0, %g0	! either negative?
+	bge	2f			! no, go do the divide
+	 mov	%o0, %g2	! delay slot: %g2 = original dividend; its sign is applied to the result at Lgot_result
+
+	tst	%o1
+	bge	1f			! divisor nonnegative, so the dividend is the negative one
+	 tst	%o0			! delay slot: flags now describe %o0 for the bge below
+	! %o1 is definitely negative; %o0 might also be negative
+	bge	2f			! if %o0 not negative...
+	 sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
+1:	! %o0 is negative, %o1 is nonnegative
+	sub	%g0, %o0, %o0	! make %o0 nonnegative
+2:
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5		! V (%o5) = divisor; Z set if it is zero
+	bne	1f
+	 mov	%o0, %o3		! delay slot: R (%o3) = dividend
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0		! NOTE(review): ST_DIV0 is not defined anywhere in this file - confirm where it comes from
+		retl
+		 clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	Lgot_result		! (and algorithm fails otherwise)
+	 clr	%o2			! delay slot: Q (%o2) = 0
+
+	sethi	%hi(1 << (32 - 4 - 1)), %g1	! %g1 = 1 << (WORDSIZE - N - 1) = 2^27
+
+	cmp	%o3, %g1
+	blu	Lnot_really_big		! dividend below 2^27: use the 4-bits-per-round loop directly
+	 clr	%o4			! delay slot: ITER (%o4) = 0
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1	! V already at or above 2^27?
+		bgeu	3f
+		 mov	1, %g7		! delay slot: SC (%g7) = 1
+
+		sll	%o5, 4, %o5	! V <<= N
+
+		b	1b
+		 add	%o4, 1, %o4	! delay slot: ITER++
+
+	! Now compute %g7.
+	2:
+		addcc	%o5, %o5, %o5	! V <<= 1; carry = bit shifted out of the top
+
+		bcc	Lnot_too_big
+		 add	%g7, 1, %g7	! delay slot: SC++
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+
+		b	Ldo_single_div
+		 sub	%g7, 1, %g7	! delay slot: undo the SC++ of the overflowing shift
+
+	Lnot_too_big:
+	3:
+		cmp	%o5, %o3
+		blu	2b		! V still below R: shift one more bit
+		 nop
+
+		be	Ldo_single_div
+		 nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g7
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	Ldo_single_div:
+		subcc	%g7, 1, %g7
+		bl	Lend_regular_divide	! no single-bit steps to do
+		 nop
+
+		sub	%o3, %o5, %o3	! first step, unconditional: R -= V
+		mov	1, %o2		! Q = 1
+
+		b	Lend_single_divloop
+		 nop
+	Lsingle_divloop:
+		sll	%o2, 1, %o2	! Q <<= 1 (sll leaves the condition codes alone)
+
+		bl	1f		! tests the sign of R set by the tst %o3 at the loop bottom
+		 srl	%o5, 1, %o5	! delay slot: V >>= 1
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+
+		b	2f
+		 add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	Lend_single_divloop:
+		subcc	%g7, 1, %g7
+		bge	Lsingle_divloop
+		 tst	%o3		! delay slot: flags = sign of R for the next step
+
+		b,a	Lend_regular_divide
+
+Lnot_really_big:
+1:
+	sll	%o5, 4, %o5		! V <<= N
+	cmp	%o5, %o3
+	bleu	1b			! keep scaling while V <= R
+	 addcc	%o4, 1, %o4		! delay slot: ITER++ (also runs on the final pass)
+	be	Lgot_result		! taken only if that addcc left ITER at zero
+	 sub	%o4, 1, %o4		! delay slot: ITER = number of shifts - 1
+
+	tst	%o3	! set up for initial iteration
+Ldivloop:
+	sll	%o2, 4, %o2		! make room in Q for the 4 bits produced by this round
+		! depth 1, accumulated bits 0
+	bl	L.1.16			! branch on the sign of R (flags from tst %o3)
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	L.2.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	L.3.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	L.4.23
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+
+	b	9f
+	 add	%o2, (7*2+1), %o2	! leaf: Q += accumulated*2 + 1 (last step subtracted)
+	
+L.4.23:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (7*2-1), %o2	! leaf: Q += accumulated*2 - 1 (last step added back)
+	
+L.3.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	L.4.21
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2+1), %o2
+	
+L.4.21:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2-1), %o2
+	
+L.2.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	L.3.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	L.4.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2+1), %o2
+
+L.4.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2-1), %o2
+
+L.3.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	L.4.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2+1), %o2
+
+L.4.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2-1), %o2
+
+L.1.16:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	L.2.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	L.3.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	L.4.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2+1), %o2
+
+L.4.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2-1), %o2
+
+L.3.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	L.4.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2+1), %o2
+
+L.4.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2-1), %o2
+
+L.2.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	L.3.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	L.4.11
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2+1), %o2
+
+L.4.11:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2-1), %o2
+
+
+L.3.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	L.4.9
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2+1), %o2
+
+L.4.9:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2-1), %o2
+
+	9:
+Lend_regular_divide:
+	subcc	%o4, 1, %o4		! --ITER
+	bge	Ldivloop		! another 4-bit round while ITER >= 0
+	 tst	%o3			! delay slot: flags = sign of R (used by Ldivloop and by the bl,a below)
+
+	bl,a	Lgot_result		! R < 0: the annulled delay slot below runs only on this path
+	! non-restoring fixup here (one instruction only!)
+	add	%o3, %o1, %o3		! delay slot: R += %o1 (divisor, already made nonnegative at entry)
+
+Lgot_result:
+	! check to see if answer should be < 0
+	tst	%g2			! %g2 = original dividend, saved at entry
+	bl,a	1f
+	 sub %g0, %o3, %o3		! annulled delay slot: negate R only when %g2 < 0
+1:
+	retl
+	 mov %o3, %o0			! delay slot: return the remainder
+
+	.globl	.rem_patch	! NOTE(review): presumably a replacement body for .rem on CPUs with hardware divide - confirm against the patching code
+.rem_patch:
+	sra	%o0, 0x1f, %o4		! %o4 = sign extension of the dividend
+	wr	%o4, 0x0, %y		! %y = high word of the 64-bit dividend %y:%o0
+	nop				! three nops between the %y write and the divide that reads it
+	nop
+	nop
+	sdivcc	%o0, %o1, %o2		! %o2 = quotient; sets V on overflow
+	bvs,a	1f
+	 xnor	%o2, %g0, %o2		! annulled delay slot, overflow only: %o2 = ~%o2
+1:	smul	%o2, %o1, %o2		! %o2 = quotient * divisor
+	retl
+	 sub	%o0, %o2, %o0		! delay slot: remainder = dividend - quotient * divisor
+	nop
diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S
new file mode 100644
index 00000000000..f0a0d4e4db7
--- /dev/null
+++ b/arch/sparc/lib/sdiv.S
@@ -0,0 +1,381 @@
+/*
+ * sdiv.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+
+
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters (name = value used to generate this file):
+ *  NAME = .div	name of function to generate
+ *  OP = div	OP=div => %o0 / %o1; OP=rem => %o0 % %o1
+ *  S = true	S=true => signed; S=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+	.globl .div			! signed divide: in %o0 = dividend, %o1 = divisor; out %o0 = %o0 / %o1
+	.globl _Div
+.div:
+_Div:	/* needed for export */
+	! compute sign of result; if neither is negative, no problem
+	orcc	%o1, %o0, %g0	! either negative?
+	bge	2f			! no, go do the divide
+	 xor	%o1, %o0, %g2	! delay slot: sign bit of %g2 = sign of the quotient, applied at Lgot_result
+
+	tst	%o1
+	bge	1f			! divisor nonnegative, so the dividend is the negative one
+	 tst	%o0			! delay slot: flags now describe %o0 for the bge below
+	! %o1 is definitely negative; %o0 might also be negative
+	bge	2f			! if %o0 not negative...
+	 sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
+1:	! %o0 is negative, %o1 is nonnegative
+	sub	%g0, %o0, %o0	! make %o0 nonnegative
+2:
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5		! V (%o5) = divisor; Z set if it is zero
+	bne	1f
+	 mov	%o0, %o3		! delay slot: R (%o3) = dividend
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0		! NOTE(review): ST_DIV0 is not defined anywhere in this file - confirm where it comes from
+		retl
+		 clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	Lgot_result		! (and algorithm fails otherwise)
+	 clr	%o2			! delay slot: Q (%o2) = 0
+
+	sethi	%hi(1 << (32 - 4 - 1)), %g1	! %g1 = 1 << (WORDSIZE - N - 1) = 2^27
+
+	cmp	%o3, %g1
+	blu	Lnot_really_big		! dividend below 2^27: use the 4-bits-per-round loop directly
+	 clr	%o4			! delay slot: ITER (%o4) = 0
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1	! V already at or above 2^27?
+		bgeu	3f
+		 mov	1, %g7		! delay slot: SC (%g7) = 1
+
+		sll	%o5, 4, %o5	! V <<= N
+
+		b	1b
+		 add	%o4, 1, %o4	! delay slot: ITER++
+
+	! Now compute %g7.
+	2:
+		addcc	%o5, %o5, %o5	! V <<= 1; carry = bit shifted out of the top
+		bcc	Lnot_too_big
+		 add	%g7, 1, %g7	! delay slot: SC++
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+
+		b	Ldo_single_div
+		 sub	%g7, 1, %g7	! delay slot: undo the SC++ of the overflowing shift
+
+	Lnot_too_big:
+	3:
+		cmp	%o5, %o3
+		blu	2b		! V still below R: shift one more bit
+		 nop
+
+		be	Ldo_single_div
+		 nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g7
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	Ldo_single_div:
+		subcc	%g7, 1, %g7
+		bl	Lend_regular_divide	! no single-bit steps to do
+		 nop
+
+		sub	%o3, %o5, %o3	! first step, unconditional: R -= V
+		mov	1, %o2		! Q = 1
+
+		b	Lend_single_divloop
+		 nop
+	Lsingle_divloop:
+		sll	%o2, 1, %o2	! Q <<= 1 (sll leaves the condition codes alone)
+
+		bl	1f		! tests the sign of R set by the tst %o3 at the loop bottom
+		 srl	%o5, 1, %o5	! delay slot: V >>= 1
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+
+		b	2f
+		 add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	Lend_single_divloop:
+		subcc	%g7, 1, %g7
+		bge	Lsingle_divloop
+		 tst	%o3		! delay slot: flags = sign of R for the next step
+
+		b,a	Lend_regular_divide
+
+Lnot_really_big:
+1:
+	sll	%o5, 4, %o5		! V <<= N
+	cmp	%o5, %o3
+	bleu	1b			! keep scaling while V <= R
+	 addcc	%o4, 1, %o4		! delay slot: ITER++ (also runs on the final pass)
+
+	be	Lgot_result		! taken only if that addcc left ITER at zero
+	 sub	%o4, 1, %o4		! delay slot: ITER = number of shifts - 1
+
+	tst	%o3	! set up for initial iteration
+Ldivloop:
+	sll	%o2, 4, %o2		! make room in Q for the 4 bits produced by this round
+		! depth 1, accumulated bits 0
+	bl	L.1.16			! branch on the sign of R (flags from tst %o3)
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	L.2.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	L.3.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	L.4.23
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (7*2+1), %o2	! leaf: Q += accumulated*2 + 1 (last step subtracted)
+
+L.4.23:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (7*2-1), %o2	! leaf: Q += accumulated*2 - 1 (last step added back)
+
+L.3.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	L.4.21
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2+1), %o2
+
+L.4.21:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2-1), %o2
+
+L.2.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	L.3.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	L.4.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2+1), %o2
+
+L.4.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2-1), %o2
+	
+	
+L.3.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	L.4.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2+1), %o2
+
+L.4.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2-1), %o2
+
+L.1.16:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	L.2.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	L.3.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	L.4.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2+1), %o2
+
+L.4.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2-1), %o2
+
+L.3.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	L.4.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2+1), %o2
+
+L.4.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2-1), %o2
+
+L.2.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	L.3.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	L.4.11
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2+1), %o2
+
+L.4.11:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2-1), %o2
+
+L.3.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	L.4.9
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2+1), %o2
+
+L.4.9:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2-1), %o2
+
+	9:
+Lend_regular_divide:
+	subcc	%o4, 1, %o4		! --ITER
+	bge	Ldivloop		! another 4-bit round while ITER >= 0
+	 tst	%o3			! delay slot: flags = sign of R (used by Ldivloop and by the bl,a below)
+
+	bl,a	Lgot_result		! R < 0: the annulled delay slot below runs only on this path
+	! non-restoring fixup here (one instruction only!)
+	sub	%o2, 1, %o2		! delay slot: Q -= 1
+
+Lgot_result:
+	! check to see if answer should be < 0
+	tst	%g2			! %g2 = dividend XOR divisor, computed at entry
+	bl,a	1f
+	 sub %g0, %o2, %o2		! annulled delay slot: negate Q only when %g2 < 0
+1:
+	retl
+	 mov %o2, %o0			! delay slot: return the quotient
+
+	.globl	.div_patch	! NOTE(review): presumably a replacement body for .div on CPUs with hardware divide - confirm against the patching code
+.div_patch:
+	sra	%o0, 0x1f, %o2		! %o2 = sign extension of the dividend
+	wr	%o2, 0x0, %y		! %y = high word of the 64-bit dividend %y:%o0
+	nop				! three nops between the %y write and the divide that reads it
+	nop
+	nop
+	sdivcc	%o0, %o1, %o0		! %o0 = quotient; sets V on overflow
+	bvs,a	1f
+	 xnor	%o0, %g0, %o0		! annulled delay slot, overflow only: %o0 = ~%o0
+1:	retl
+	 nop
diff --git a/arch/sparc/lib/strlen_user_32.S b/arch/sparc/lib/strlen_user_32.S
new file mode 100644
index 00000000000..8c8a371df3c
--- /dev/null
+++ b/arch/sparc/lib/strlen_user_32.S
@@ -0,0 +1,109 @@
+/* strlen_user.S: Sparc optimized strlen_user code
+ *
+ * Return length of string in userspace including terminating 0
+ * or 0 for error
+ *
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+10:	! unaligned start (entered from __strnlen_user): first leading byte
+	ldub	[%o0], %o5
+	cmp	%o5, 0
+	be	1f			! NUL in byte 1: return 1
+	 add	%o0, 1, %o0
+	andcc	%o0, 3, %g0		! word aligned yet?
+	be	4f
+	 or	%o4, %lo(HI_MAGIC), %o3	! delay slot: %o3 = HI_MAGIC (%o4 holds its high part from the entry delay slot)
+11:	! second leading byte
+	ldub	[%o0], %o5
+	cmp	%o5, 0
+	be	2f			! NUL in byte 2: return 2
+	 add	%o0, 1, %o0
+	andcc	%o0, 3, %g0		! word aligned yet?
+	be	5f
+	 sethi	%hi(LO_MAGIC), %o4	! delay slot: start building LO_MAGIC
+12:	! third leading byte
+	ldub	[%o0], %o5
+	cmp	%o5, 0
+	be	3f			! NUL in byte 3: return 3
+	 add	%o0, 1, %o0
+	b	13f			! after three bytes an unaligned pointer is word aligned
+	 or	%o4, %lo(LO_MAGIC), %o2	! delay slot: %o2 = LO_MAGIC
+1:
+	retl
+	 mov	1, %o0
+2:
+	retl
+	 mov	2, %o0
+3:
+	retl
+	 mov	3, %o0
+
+	.align 4
+	.global __strlen_user, __strnlen_user	! in: %o0 = string, %o1 = limit (__strnlen_user only); out: %o0 = length counting the NUL
+__strlen_user:
+	sethi	%hi(32768), %o1		! __strlen_user: limit = 32768, then fall into __strnlen_user
+__strnlen_user:
+	mov	%o1, %g1		! %g1 = limit
+	mov	%o0, %o1		! %o1 = start of string
+	andcc	%o0, 3, %g0		! word aligned?
+	bne	10b			! no: handle leading bytes one at a time
+	 sethi	%hi(HI_MAGIC), %o4	! delay slot: start building HI_MAGIC
+	or	%o4, %lo(HI_MAGIC), %o3	! %o3 = HI_MAGIC
+4:
+	sethi	%hi(LO_MAGIC), %o4
+5:
+	or	%o4, %lo(LO_MAGIC), %o2	! %o2 = LO_MAGIC
+13:
+	ld	[%o0], %o5		! next four bytes
+2:
+	sub	%o5, %o2, %o4		! (word - LO_MAGIC) & HI_MAGIC is nonzero if some byte may be NUL
+	andcc	%o4, %o3, %g0
+	bne	82f			! candidate: inspect the four bytes individually
+	 add	%o0, 4, %o0		! delay slot: advance past this word
+	sub	%o0, %o1, %g2		! %g2 = bytes scanned so far
+81:	cmp	%g2, %g1
+	blu	13b			! still under the limit: next word
+	 mov	%o0, %o4		! delay slot: %o4 = current position
+	ba,a	1f			! limit reached: return the number of bytes scanned
+
+	/* Check every byte. */
+82:	srl	%o5, 24, %g5		! first byte of the word (bits 31..24)
+	andcc	%g5, 0xff, %g0
+	be	1f
+	 add	%o0, -3, %o4		! delay slot: %o4 = address just past the first byte
+	srl	%o5, 16, %g5		! second byte
+	andcc	%g5, 0xff, %g0
+	be	1f
+	 add	%o4, 1, %o4		! delay slot: %o4 = address just past the second byte
+	srl	%o5, 8, %g5		! third byte
+	andcc	%g5, 0xff, %g0
+	be	1f
+	 add	%o4, 1, %o4		! delay slot: %o4 = address just past the third byte
+	andcc	%o5, 0xff, %g0		! fourth byte
+	bne	81b			! no NUL in this word after all: back to the limit check
+	 sub	%o0, %o1, %g2		! delay slot: %g2 = bytes scanned so far
+
+	add	%o4, 1, %o4		! NUL is the fourth byte
+1:
+	retl
+	 sub	%o4, %o1, %o0		! delay slot: result = %o4 - start
+
+	.section .fixup,#alloc,#execinstr
+	.align	4
+9:	! fault path: return 0, the error value named in the file header
+	retl
+	 clr	%o0
+
+	.section __ex_table,#alloc
+	.align	4
+	! each entry names one load of the string (10, 11, 12, 13) and the stub 9 above
+	.word	10b, 9b
+	.word	11b, 9b
+	.word	12b, 9b
+	.word	13b, 9b
diff --git a/arch/sparc/lib/strlen_user_64.S b/arch/sparc/lib/strlen_user_64.S
new file mode 100644
index 00000000000..114ed111e25
--- /dev/null
+++ b/arch/sparc/lib/strlen_user_64.S
@@ -0,0 +1,95 @@
+/* strlen_user.S: Sparc64 optimized strlen_user code
+ *
+ * Return length of string in userspace including terminating 0
+ * or 0 for error
+ *
+ * Copyright (C) 1991,1996 Free Software Foundation
+ * Copyright (C) 1996,1999 David S. Miller (davem@redhat.com)
+ * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ */
+
+#include <asm/asi.h>
+
+#define LO_MAGIC 0x01010101
+#define HI_MAGIC 0x80808080
+
+	.align 4
+	.global __strlen_user, __strnlen_user	! in: %o0 = string, %o1 = limit (__strnlen_user only); out: %o0 = length counting the NUL
+__strlen_user:
+	sethi	%hi(32768), %o1		! __strlen_user: limit = 32768, then fall into __strnlen_user
+__strnlen_user:	
+	mov	%o1, %g1		! %g1 = limit
+	mov	%o0, %o1		! %o1 = start of string
+	andcc	%o0, 3, %g0		! word aligned?
+	be,pt	%icc, 9f
+	 sethi	%hi(HI_MAGIC), %o4	! delay slot: start building HI_MAGIC
+10:	lduba	[%o0] %asi, %o5		! first leading byte
+	brz,pn	%o5, 21f		! NUL: return 1
+	 add	%o0, 1, %o0
+	andcc	%o0, 3, %g0		! word aligned yet?
+	be,pn	%icc, 4f
+	 or	%o4, %lo(HI_MAGIC), %o3	! delay slot: %o3 = HI_MAGIC
+11:	lduba	[%o0] %asi, %o5		! second leading byte
+	brz,pn	%o5, 22f		! NUL: return 2
+	 add	%o0, 1, %o0
+	andcc	%o0, 3, %g0		! word aligned yet?
+	be,pt	%icc, 13f
+	 srl	%o3, 7, %o2		! delay slot: %o2 = HI_MAGIC >> 7 = LO_MAGIC
+12:	lduba	[%o0] %asi, %o5		! third leading byte
+	brz,pn	%o5, 23f		! NUL: return 3
+	 add	%o0, 1, %o0
+	ba,pt	%icc, 2f		! after three bytes an unaligned pointer is word aligned
+15:	 lda	[%o0] %asi, %o5		! delay slot: load the first aligned word
+9:	or	%o4, %lo(HI_MAGIC), %o3	! %o3 = HI_MAGIC
+4:	srl	%o3, 7, %o2		! %o2 = HI_MAGIC >> 7 = LO_MAGIC
+13:	lda	[%o0] %asi, %o5		! next four bytes
+2:	sub	%o5, %o2, %o4		! (word - LO_MAGIC) & HI_MAGIC is nonzero if some byte may be NUL
+	andcc	%o4, %o3, %g0
+	bne,pn	%icc, 82f		! candidate: inspect the four bytes individually
+	 add	%o0, 4, %o0		! delay slot: advance past this word
+	sub	%o0, %o1, %g2		! %g2 = bytes scanned so far
+81:	cmp	%g2, %g1
+	blu,pt	%icc, 13b		! still under the limit: next word
+	 mov	%o0, %o4		! delay slot: %o4 = current position
+	ba,a,pt	%xcc, 1f		! limit reached: return the number of bytes scanned
+
+	/* Check every byte. */
+82:	srl	%o5, 24, %g7		! first byte of the word (bits 31..24)
+	andcc	%g7, 0xff, %g0
+	be,pn	%icc, 1f
+	 add	%o0, -3, %o4		! delay slot: %o4 = address just past the first byte
+	srl	%o5, 16, %g7		! second byte
+	andcc	%g7, 0xff, %g0
+	be,pn	%icc, 1f
+	 add	%o4, 1, %o4		! delay slot: %o4 = address just past the second byte
+	srl	%o5, 8, %g7		! third byte
+	andcc	%g7, 0xff, %g0
+	be,pn	%icc, 1f
+	 add	%o4, 1, %o4		! delay slot: %o4 = address just past the third byte
+	andcc	%o5, 0xff, %g0		! fourth byte
+	bne,pt	%icc, 81b		! no NUL in this word after all: back to the limit check
+	 sub	%o0, %o1, %g2		! delay slot: %g2 = bytes scanned so far
+	add	%o4, 1, %o4		! NUL is the fourth byte
+1:	retl
+	 sub	%o4, %o1, %o0		! delay slot: result = %o4 - start
+21:	retl
+	 mov	1, %o0
+22:	retl
+	 mov	2, %o0
+23:	retl
+	 mov	3, %o0
+
+        .section .fixup,#alloc,#execinstr
+        .align  4
+30:	! fault path: return 0, the error value named in the file header
+        retl
+         clr    %o0
+
+	.section __ex_table,"a"
+	.align	4
+	! each entry names one load of the string (10, 11, 12, 15, 13) and the stub 30 above
+	.word	10b, 30b
+	.word	11b, 30b
+	.word	12b, 30b
+	.word	15b, 30b
+	.word	13b, 30b
diff --git a/arch/sparc/lib/strncpy_from_user_32.S b/arch/sparc/lib/strncpy_from_user_32.S
new file mode 100644
index 00000000000..d77198976a6
--- /dev/null
+++ b/arch/sparc/lib/strncpy_from_user_32.S
@@ -0,0 +1,47 @@
+/* strncpy_from_user.S: Sparc strncpy from userspace.
+ *
+ *  Copyright(C) 1996 David S. Miller
+ */
+
+#include <asm/ptrace.h>
+#include <asm/errno.h>
+
+	.text
+	.align	4
+
+	/* Must return:
+	 *
+	 * -EFAULT		for an exception
+	 * count		if we hit the buffer limit
+	 * bytes copied		if we hit a null byte
+	 */
+
+	.globl	__strncpy_from_user
+__strncpy_from_user:
+	/* %o0=dest, %o1=src, %o2=count */
+	mov	%o2, %o3		! %o3 = original count
+1:
+	subcc	%o2, 1, %o2		! one fewer byte allowed
+	bneg	2f			! went below zero: buffer limit reached
+	 nop
+10:
+	ldub	[%o1], %o4		! next source byte (the load listed in __ex_table)
+	add	%o0, 1, %o0
+	cmp	%o4, 0
+	add	%o1, 1, %o1
+	bne	1b			! keep copying until a NUL
+	 stb	%o4, [%o0 - 1]		! delay slot: store the byte, the NUL included
+2:
+	add	%o2, 1, %o0		! %o0 = bytes of the budget still unused
+	retl
+	 sub	%o3, %o0, %o0		! delay slot: result = count - unused (the NUL is not counted)
+
+	.section .fixup,#alloc,#execinstr
+	.align	4
+4:	! fault path: return -EFAULT
+	retl
+	 mov	-EFAULT, %o0
+
+	.section __ex_table,#alloc
+	.align	4
+	.word	10b, 4b			! the ldub at label 10 and the stub 4 above
diff --git a/arch/sparc/lib/strncpy_from_user_64.S b/arch/sparc/lib/strncpy_from_user_64.S
new file mode 100644
index 00000000000..511c8f136f9
--- /dev/null
+++ b/arch/sparc/lib/strncpy_from_user_64.S
@@ -0,0 +1,135 @@
+/*
+ * strncpy_from_user.S: Sparc64 strncpy from userspace.
+ *
+ *  Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ */
+
+#include <asm/asi.h>
+#include <asm/errno.h>
+
+	.data
+	.align	8
+0:	.xword	0x0101010101010101
+
+	.text
+	.align	32
+
+	/* Must return:
+	 *
+	 * -EFAULT		for an exception
+	 * count		if we hit the buffer limit
+	 * bytes copied		if we hit a null byte
+	 * (without the null byte)
+	 *
+	 * This implementation assumes:
+	 * %o1 is 8 aligned => !(%o2 & 7)
+	 * %o0 is 8 aligned (if not, it will be slooooow, but will work)
+	 *
+	 * This is optimized for the common case:
+	 * in my stats, 90% of src are 8 aligned (even on sparc32)
+	 * and average length is 18 or so.
+	 */
+
+	.globl	__strncpy_from_user
+	.type	__strncpy_from_user,#function
+__strncpy_from_user:
+	/* %o0=dest, %o1=src, %o2=count */
+	andcc	%o1, 7, %g0		! IEU1	Group	src 8-byte aligned?
+	bne,pn	%icc, 30f		! CTI		no: byte-at-a-time loop
+	 add	%o0, %o2, %g3		! IEU0		%g3 = dest + count
+60:	ldxa	[%o1] %asi, %g1		! Load	Group	first 8 source bytes
+	brlez,pn %o2, 10f		! CTI		count <= 0: return count
+	 mov	%o0, %o3		! IEU0		%o3 = original dest
+50:	sethi	%hi(0b), %o4		! IEU0	Group
+	ldx	[%o4 + %lo(0b)], %o4	! Load		%o4 = 0x0101010101010101
+	sllx	%o4, 7, %o5		! IEU1	Group	%o5 = 0x8080808080808080
+1:	sub	%g1, %o4, %g2		! IEU0	Group	%g2 & %o5 is nonzero if some byte may be NUL
+	stx	%g1, [%o0]		! Store		all 8 bytes are stored before the check
+	add	%o0, 8, %o0		! IEU1
+	andcc	%g2, %o5, %g0		! IEU1	Group
+	bne,pn	%xcc, 5f		! CTI		candidate NUL: inspect bytes
+	 add	%o1, 8, %o1		! IEU0
+	cmp	%o0, %g3		! IEU1	Group
+	bl,a,pt %xcc, 1b		! CTI		annulled: next load only when looping
+61:	 ldxa	[%o1] %asi, %g1		! Load
+10:	retl				! CTI	Group
+	 mov	%o2, %o0		! IEU0		buffer limit: return count
+5:	srlx	%g2, 32, %g7		! IEU0	Group	upper half of the probe
+	sethi	%hi(0xff00), %o4	! IEU1
+	andcc	%g7, %o5, %g0		! IEU1	Group
+	be,pn	%icc, 2f		! CTI		no candidate in bytes 0-3
+	 or	%o4, %lo(0xff00), %o4	! IEU0		%o4 = 0xff00 (constant is reloaded at 50 if the loop resumes)
+	srlx	%g1, 48, %g7		! IEU0	Group	bytes 0 and 1
+	andcc	%g7, %o4, %g0		! IEU1	Group	byte 0
+	be,pn	%icc, 50f		! CTI
+	 andcc	%g7, 0xff, %g0		! IEU1	Group	byte 1
+	be,pn	%icc, 51f		! CTI
+	 srlx	%g1, 32, %g7		! IEU0		bytes 2 and 3
+	andcc	%g7, %o4, %g0		! IEU1	Group	byte 2
+	be,pn	%icc, 52f		! CTI
+	 andcc	%g7, 0xff, %g0		! IEU1	Group	byte 3
+	be,pn	%icc, 53f		! CTI
+2:	 andcc	%g2, %o5, %g0		! IEU1	Group	lower half of the probe (icc)
+	be,pn	%icc, 2f		! CTI		no candidate in bytes 4-7
+	 srl	%g1, 16, %g7		! IEU0		bytes 4 and 5
+	andcc	%g7, %o4, %g0		! IEU1	Group	byte 4
+	be,pn	%icc, 54f		! CTI
+	 andcc	%g7, 0xff, %g0		! IEU1	Group	byte 5
+	be,pn	%icc, 55f		! CTI
+	 andcc	%g1, %o4, %g0		! IEU1	Group	byte 6
+	be,pn	%icc, 56f		! CTI
+	 andcc	%g1, 0xff, %g0		! IEU1	Group	byte 7
+	be,a,pn	%icc, 57f		! CTI
+	 sub	%o0, %o3, %o0		! IEU0		annulled: only when byte 7 is NUL
+2:	cmp	%o0, %g3		! IEU1	Group	false alarm: continue if room remains
+	bl,a,pt	%xcc, 50b		! CTI		back to 50 to reload the constants
+62:	 ldxa	[%o1] %asi, %g1		! Load
+	retl				! CTI	Group
+	 mov	%o2, %o0		! IEU0		buffer limit: return count
+50:	sub	%o0, %o3, %o0		! NUL in byte k of the last word: return (dest - start) - (8 - k)
+	retl
+	 sub	%o0, 8, %o0
+51:	sub	%o0, %o3, %o0
+	retl
+	 sub	%o0, 7, %o0
+52:	sub	%o0, %o3, %o0
+	retl
+	 sub	%o0, 6, %o0
+53:	sub	%o0, %o3, %o0
+	retl
+	 sub	%o0, 5, %o0
+54:	sub	%o0, %o3, %o0
+	retl
+	 sub	%o0, 4, %o0
+55:	sub	%o0, %o3, %o0
+	retl
+	 sub	%o0, 3, %o0
+56:	sub	%o0, %o3, %o0
+	retl
+	 sub	%o0, 2, %o0
+57:	retl
+	 sub	%o0, 1, %o0
+30:	brlez,pn %o2, 3f		! unaligned src; count <= 0: return count
+	 sub	%g0, %o2, %o3		! delay slot: %o3 = -count, counts up to zero
+	add	%o0, %o2, %o0		! %o0 = dest + count, so [%o0 + %o3] walks dest
+63:	lduba	[%o1] %asi, %o4		! first source byte
+1:	add	%o1, 1, %o1
+	brz,pn	%o4, 2f			! NUL: done
+	 stb	%o4, [%o0 + %o3]	! delay slot: store the byte, the NUL included
+	addcc	%o3, 1, %o3		! zero once count bytes are copied
+	bne,a,pt	%xcc, 1b	! annulled: no load past src[count - 1] when the count is used up
+64:	 lduba	[%o1] %asi, %o4		! next source byte (only when looping)
+3:	retl
+	 mov	%o2, %o0		! buffer limit: return count
+2:	retl
+	 add	%o2, %o3, %o0		! count + (index - count) = bytes before the NUL
+	.size	__strncpy_from_user, .-__strncpy_from_user
+
+	.section __ex_table,"a"
+	.align	4
+	.word	60b, __retl_efault	! every user load (60-64) is paired with __retl_efault, which is defined outside this file
+	.word	61b, __retl_efault
+	.word	62b, __retl_efault
+	.word	63b, __retl_efault
+	.word	64b, __retl_efault
+	.previous
diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S
new file mode 100644
index 00000000000..2101405bdfc
--- /dev/null
+++ b/arch/sparc/lib/udiv.S
@@ -0,0 +1,357 @@
+/*
+ * udiv.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+
+
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters (names restored; values are those used for this file):
+ *  NAME = .udiv	name of function to generate
+ *  OP = div		OP=div => %o0 / %o1; OP=rem => %o0 % %o1
+ *  S = false		S=true => signed; S=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+	.globl .udiv			! unsigned 32-bit divide: returns %o0 / %o1 in %o0
+	.globl _Udiv
+.udiv:
+_Udiv:	/* needed for export */
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5		! V (%o5) = divisor; Z is set when the divisor is zero
+	bne	1f
+	 mov	%o0, %o3		! delay slot: R (%o3) = dividend
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0		! NOTE(review): ST_DIV0 is not defined or included in this file - confirm its source
+		retl
+		 clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	Lgot_result		! (and algorithm fails otherwise)
+	 clr	%o2			! delay slot: Q (%o2) = 0, which is also the result when taken
+
+	sethi	%hi(1 << (32 - 4 - 1)), %g1	! %g1 = 2^27, the big-dividend threshold
+
+	cmp	%o3, %g1
+	blu	Lnot_really_big
+	 clr	%o4			! delay slot: ITER (%o4) = 0
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1	! stop scaling once V >= 2^27
+		bgeu	3f
+		 mov	1, %g7		! delay slot: %g7 = 1, the single-bit step count
+
+		sll	%o5, 4, %o5	! V *= 16
+
+		b	1b
+		 add	%o4, 1, %o4	! delay slot: ITER += 1
+
+	! Now compute %g7.
+	2:
+		addcc	%o5, %o5, %o5	! V *= 2; carry is set if the top bit was shifted out
+		bcc	Lnot_too_big
+		 add	%g7, 1, %g7	! delay slot: one more single-bit step
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+
+		b	Ldo_single_div
+		 sub	%g7, 1, %g7	! delay slot: take back the count added above
+
+	Lnot_too_big:
+	3:
+		cmp	%o5, %o3	! keep doubling V while V < R (unsigned)
+		blu	2b
+		 nop
+
+		be	Ldo_single_div	! V == R
+		 nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g7
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	Ldo_single_div:
+		subcc	%g7, 1, %g7	! any single-bit steps to do?
+		bl	Lend_regular_divide	! none: go straight to the 4-bit loop test
+		 nop
+
+		sub	%o3, %o5, %o3	! first step is unconditional: R -= V
+		mov	1, %o2		! Q = 1
+
+		b	Lend_single_divloop
+		 nop
+	Lsingle_divloop:
+		sll	%o2, 1, %o2	! Q <<= 1 (sll leaves the flags from tst %o3 untouched)
+		bl	1f		! sign of R picks the add or the subtract step
+		 srl	%o5, 1, %o5	! delay slot: V >>= 1
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		 add	%o2, 1, %o2	! delay slot: Q += 1
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2	! Q -= 1
+	2:
+	Lend_single_divloop:
+		subcc	%g7, 1, %g7	! one fewer single-bit step left
+		bge	Lsingle_divloop
+		 tst	%o3		! delay slot: flags = sign of R for the bl above
+
+		b,a	Lend_regular_divide	! annulled: no delay-slot instruction runs
+
+Lnot_really_big:
+1:
+	sll	%o5, 4, %o5		! V *= 16
+
+	cmp	%o5, %o3
+	bleu	1b			! loop while V <= R (unsigned)
+	 addcc	%o4, 1, %o4		! delay slot: ITER += 1, executed on loop exit as well
+
+	be	Lgot_result		! tests the addcc result: taken only if ITER became 0
+	 sub	%o4, 1, %o4		! delay slot: ITER -= 1
+
+	tst	%o3	! set up for initial iteration
+Ldivloop:
+	sll	%o2, 4, %o2		! Q <<= 4: make room for the digit added at a leaf below
+		! depth 1, accumulated bits 0
+	bl	L.1.16
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	L.2.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	L.3.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	L.4.23
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (7*2+1), %o2
+
+L.4.23:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (7*2-1), %o2
+
+L.3.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	L.4.21
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2+1), %o2
+
+L.4.21:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2-1), %o2
+
+L.2.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	L.3.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	L.4.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2+1), %o2
+
+L.4.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2-1), %o2
+
+L.3.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	L.4.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2+1), %o2
+
+L.4.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2-1), %o2
+
+L.1.16:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	L.2.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	L.3.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	L.4.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2+1), %o2
+
+L.4.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2-1), %o2
+
+L.3.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	L.4.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2+1), %o2
+
+L.4.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2-1), %o2
+
+L.2.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	L.3.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	L.4.11
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2+1), %o2
+
+L.4.11:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2-1), %o2
+
+L.3.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	L.4.9
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2+1), %o2
+
+L.4.9:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2-1), %o2
+
+	9:				! every leaf of the 4-step tree branches here
+Lend_regular_divide:
+	subcc	%o4, 1, %o4		! ITER -= 1
+	bge	Ldivloop
+	 tst	%o3			! delay slot: flags = sign of R, used by Ldivloop and by bl,a below
+
+	bl,a	Lgot_result		! R < 0: take the fixup in the annulled delay slot
+	! non-restoring fixup here (one instruction only!)
+	sub	%o2, 1, %o2		! runs only when the bl,a is taken: Q -= 1
+
+Lgot_result:
+
+	retl
+	 mov %o2, %o0			! delay slot: return Q in %o0
+
+	.globl	.udiv_patch		! hardware-divide version of .udiv; presumably copied over .udiv at run time - confirm against the patching code
+.udiv_patch:
+	wr	%g0, 0x0, %y		! %y = 0: clear the high word of the 64-bit dividend used by udiv
+	nop				! nop, nop, retl: three instructions between the %y write and udiv
+	nop
+	retl
+	 udiv	%o0, %o1, %o0		! delay slot: %o0 = %o0 / %o1
+	nop
diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S
new file mode 100644
index 00000000000..1f36ae68252
--- /dev/null
+++ b/arch/sparc/lib/umul.S
@@ -0,0 +1,171 @@
+/*
+ * umul.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+
+
+/*
+ * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
+ * upper 32 bits of the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies.  Short
+ * multiplies require 25 instruction cycles, and long ones require
+ * 45 instruction cycles.
+ *
+ * On return, overflow has occurred (%o1 is not zero) if and only if
+ * the Z condition code is clear, allowing, e.g., the following:
+ *
+ *	call	.umul
+ *	nop
+ *	bnz	overflow	(or tnz)
+ */
+
+	.globl .umul			! unsigned 32x32 multiply: 64-bit product returned in %o1 (high) and %o0 (low)
+	.globl _Umul
+.umul:
+_Umul:	/* needed for export */
+	or	%o0, %o1, %o4		! OR of both operands, so one test below covers both
+	mov	%o0, %y		! multiplier -> Y
+
+	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args
+	be	Lmul_shortway	! if zero, can do it the short way
+	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V
+
+	/*
+	 * Long multiply.  32 steps, followed by a final shift step.
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %o1, %o4	! 13
+	mulscc	%o4, %o1, %o4	! 14
+	mulscc	%o4, %o1, %o4	! 15
+	mulscc	%o4, %o1, %o4	! 16
+	mulscc	%o4, %o1, %o4	! 17
+	mulscc	%o4, %o1, %o4	! 18
+	mulscc	%o4, %o1, %o4	! 19
+	mulscc	%o4, %o1, %o4	! 20
+	mulscc	%o4, %o1, %o4	! 21
+	mulscc	%o4, %o1, %o4	! 22
+	mulscc	%o4, %o1, %o4	! 23
+	mulscc	%o4, %o1, %o4	! 24
+	mulscc	%o4, %o1, %o4	! 25
+	mulscc	%o4, %o1, %o4	! 26
+	mulscc	%o4, %o1, %o4	! 27
+	mulscc	%o4, %o1, %o4	! 28
+	mulscc	%o4, %o1, %o4	! 29
+	mulscc	%o4, %o1, %o4	! 30
+	mulscc	%o4, %o1, %o4	! 31
+	mulscc	%o4, %o1, %o4	! 32
+	mulscc	%o4, %g0, %o4	! final shift
+
+
+	/*
+	 * Normally, with the shift-and-add approach, if both numbers are
+	 * positive you get the correct result.  With 32-bit two's-complement
+	 * numbers, -x is represented as
+	 *
+	 *		  x		    32
+	 *	( 2  -  ------ ) mod 2  *  2
+	 *		   32
+	 *		  2
+	 *
+	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
+	 * we can treat this as if the radix point were just to the left
+	 * of the sign bit (multiply by 2^32), and get
+	 *
+	 *	-x  =  (2 - x) mod 2
+	 *
+	 * Then, ignoring the `mod 2's for convenience:
+	 *
+	 *   x *  y	= xy
+	 *  -x *  y	= 2y - xy
+	 *   x * -y	= 2x - xy
+	 *  -x * -y	= 4 - 2x - 2y + xy
+	 *
+	 * For signed multiplies, we subtract (x << 32) from the partial
+	 * product to fix this problem for negative multipliers (see mul.s).
+	 * Because of the way the shift into the partial product is calculated
+	 * (N xor V), this term is automatically removed for the multiplicand,
+	 * so we don't have to adjust.
+	 *
+	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
+	 * and the correction is wrong.  So for unsigned multiplies where the
+	 * high order bit is one, we end up with xy - (y << 32).  To fix it
+	 * we add y << 32.
+	 */
+#if 0
+	tst	%o1
+	bl,a	1f		! if %o1 < 0 (high order bit = 1),
+	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)
+
+1:
+	rd	%y, %o0		! get lower half of product
+	retl
+	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
+#else
+	/* Faster code from tege@sics.se.  */
+	sra	%o1, 31, %o2	! make mask from sign bit
+	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1
+	rd	%y, %o0		! get lower half of product
+	retl
+	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place
+#endif
+
+Lmul_shortway:
+	/*
+	 * Short multiply.  12 steps, followed by a final shift step.
+	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+	 * but there is no problem with %o0 being negative (unlike above),
+	 * and overflow is impossible (the answer is at most 24 bits long).
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 * %o4 has 20 of the bits that should be in the result; %y has
+	 * the bottom 12 (as %y's top 12).  That is:
+	 *
+	 *	  %o4		    %y
+	 * +----------------+----------------+
+	 * | -12- |   -20-  | -12- |   -20-  |
+	 * +------(---------+------)---------+
+	 *	   -----result-----
+	 *
+	 * The 12 bits of %o4 left of the `result' area are all zero;
+	 * in fact, all top 20 bits of %o4 are zero.
+	 */
+
+	rd	%y, %o5		! %o5 = Y, whose top 12 bits are the low 12 result bits
+	sll	%o4, 12, %o0	! shift middle bits left 12
+	srl	%o5, 20, %o5	! shift low bits right 20
+	or	%o5, %o0, %o0	! %o0 = the two pieces merged into the low product word
+	retl
+	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
+
+	.globl	.umul_patch		! hardware-multiply version of .umul; presumably copied over .umul at run time - confirm against the patching code
+.umul_patch:
+	umul	%o0, %o1, %o0		! %o0 = low 32 bits of %o0 * %o1; the high 32 bits land in %y
+	retl
+	 rd	%y, %o1			! delay slot: %o1 = high 32 bits of the product
+	nop				! NOTE(review): nothing here sets the condition codes, unlike the Z contract documented for .umul
diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S
new file mode 100644
index 00000000000..77123eb83c4
--- /dev/null
+++ b/arch/sparc/lib/urem.S
@@ -0,0 +1,357 @@
+/*
+ * urem.S:      This routine was taken from glibc-1.09 and is covered
+ *              by the GNU Library General Public License Version 2.
+ */
+
+/* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters (names restored; values are those used for this file):
+ *  NAME = .urem	name of function to generate
+ *  OP = rem		OP=div => %o0 / %o1; OP=rem => %o0 % %o1
+ *  S = false		S=true => signed; S=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+	.globl .urem			! unsigned 32-bit remainder: returns %o0 % %o1 in %o0
+	.globl _Urem
+.urem:
+_Urem:	/* needed for export */
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5		! V (%o5) = divisor; Z is set when the divisor is zero
+	bne	1f
+	 mov	%o0, %o3		! delay slot: R (%o3) = dividend
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0		! NOTE(review): ST_DIV0 is not defined or included in this file - confirm its source
+		retl
+		 clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	Lgot_result		! (and algorithm fails otherwise)
+	 clr	%o2			! delay slot: Q (%o2) = 0; R still holds the dividend, which is returned
+
+	sethi	%hi(1 << (32 - 4 - 1)), %g1	! %g1 = 2^27, the big-dividend threshold
+
+	cmp	%o3, %g1
+	blu	Lnot_really_big
+	 clr	%o4			! delay slot: ITER (%o4) = 0
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1	! stop scaling once V >= 2^27
+		bgeu	3f
+		 mov	1, %g7		! delay slot: %g7 = 1, the single-bit step count
+
+		sll	%o5, 4, %o5	! V *= 16
+
+		b	1b
+		 add	%o4, 1, %o4	! delay slot: ITER += 1
+
+	! Now compute %g7.
+	2:
+		addcc	%o5, %o5, %o5	! V *= 2; carry is set if the top bit was shifted out
+		bcc	Lnot_too_big
+		 add	%g7, 1, %g7	! delay slot: one more single-bit step
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+
+		b	Ldo_single_div
+		 sub	%g7, 1, %g7	! delay slot: take back the count added above
+
+	Lnot_too_big:
+	3:
+		cmp	%o5, %o3	! keep doubling V while V < R (unsigned)
+		blu	2b
+		 nop
+
+		be	Ldo_single_div	! V == R
+		 nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g7
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	Ldo_single_div:
+		subcc	%g7, 1, %g7	! any single-bit steps to do?
+		bl	Lend_regular_divide	! none: go straight to the 4-bit loop test
+		 nop
+
+		sub	%o3, %o5, %o3	! first step is unconditional: R -= V
+		mov	1, %o2		! Q = 1
+
+		b	Lend_single_divloop
+		 nop
+	Lsingle_divloop:
+		sll	%o2, 1, %o2	! Q <<= 1 (sll leaves the flags from tst %o3 untouched)
+		bl	1f		! sign of R picks the add or the subtract step
+		 srl	%o5, 1, %o5	! delay slot: V >>= 1
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		 add	%o2, 1, %o2	! delay slot: Q += 1
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2	! Q -= 1
+	2:
+	Lend_single_divloop:
+		subcc	%g7, 1, %g7	! one fewer single-bit step left
+		bge	Lsingle_divloop
+		 tst	%o3		! delay slot: flags = sign of R for the bl above
+
+		b,a	Lend_regular_divide	! annulled: no delay-slot instruction runs
+
+Lnot_really_big:
+1:
+	sll	%o5, 4, %o5		! V *= 16
+
+	cmp	%o5, %o3
+	bleu	1b			! loop while V <= R (unsigned)
+	 addcc	%o4, 1, %o4		! delay slot: ITER += 1, executed on loop exit as well
+
+	be	Lgot_result		! tests the addcc result: taken only if ITER became 0
+	 sub	%o4, 1, %o4		! delay slot: ITER -= 1
+
+	tst	%o3	! set up for initial iteration
+Ldivloop:
+	sll	%o2, 4, %o2		! Q <<= 4: make room for the digit added at a leaf below
+		! depth 1, accumulated bits 0
+	bl	L.1.16
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	L.2.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	L.3.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	L.4.23
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (7*2+1), %o2
+
+L.4.23:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (7*2-1), %o2
+
+L.3.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	L.4.21
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2+1), %o2
+
+L.4.21:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (5*2-1), %o2
+
+L.2.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	L.3.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	L.4.19
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2+1), %o2
+
+L.4.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (3*2-1), %o2
+
+L.3.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	L.4.17
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2+1), %o2
+	
+L.4.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (1*2-1), %o2
+
+L.1.16:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	L.2.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	L.3.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	L.4.15
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2+1), %o2
+
+L.4.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-1*2-1), %o2
+
+L.3.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	L.4.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2+1), %o2
+
+L.4.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-3*2-1), %o2
+
+L.2.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	L.3.13
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	L.4.11
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2+1), %o2
+	
+L.4.11:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-5*2-1), %o2
+
+L.3.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	L.4.9
+	 srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2+1), %o2
+
+L.4.9:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	 add	%o2, (-7*2-1), %o2
+
+	9:				! every leaf of the 4-step tree branches here
+Lend_regular_divide:
+	subcc	%o4, 1, %o4		! ITER -= 1
+	bge	Ldivloop
+	 tst	%o3			! delay slot: flags = sign of R, used by Ldivloop and by bl,a below
+
+	bl,a	Lgot_result		! R < 0: take the fixup in the annulled delay slot
+	! non-restoring fixup here (one instruction only!)
+	add	%o3, %o1, %o3		! runs only when the bl,a is taken: R += divisor
+
+Lgot_result:
+
+	retl
+	 mov %o3, %o0			! delay slot: return R in %o0
+
+	.globl	.urem_patch		! hardware-divide version of .urem; presumably copied over .urem at run time - confirm against the patching code
+.urem_patch:
+	wr	%g0, 0x0, %y		! %y = 0: clear the high word of the 64-bit dividend used by udiv
+	nop				! three nops between the %y write and udiv
+	nop
+	nop
+	udiv	%o0, %o1, %o2		! %o2 = %o0 / %o1
+	umul	%o2, %o1, %o2		! %o2 = quotient * divisor (low 32 bits)
+	retl
+	 sub	%o0, %o2, %o0		! delay slot: %o0 = dividend - quotient * divisor
-- 
cgit v1.2.2