1 files changed, 120 insertions, 0 deletions
diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S
new file mode 100644
index 000000000000..6895c0225b85
--- /dev/null
+++ b/arch/sh/lib64/udivdi3.S
@@ -0,0 +1,120 @@
+        .section        .text..SHmedia32,"ax"
+        .align  2
+        .global __udivdi3
+__udivdi3:      /* Unsigned 64-bit divide: r3 = divisor, quotient ends up in r2 (dividend presumably enters in r2 - confirm against the ABI).  */
+        shlri r3,1,r4           /* r4 = r3 >> 1 (logical), so its top bit is clear for nsb */
+        nsb r4,r22              /* r22 = sign-bit count of r4; used next as the left shift applied to r3 */
+        shlld r3,r22,r6         /* r6 = r3 << r22: divisor shifted up (normalized) */
+        shlri r6,49,r5          /* r5 = top 15 bits of r6 */
+        movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
+        sub r21,r5,r1           /* r1 = constant - r5: first reciprocal estimate, refined below */
+        mmulfx.w r1,r1,r4       /* r4 = r1 * r1 (fixed-point multiply on 16-bit words) */
+        mshflo.w r1,r63,r1      /* interleave the low words of r1 with zero words from r63 */
+        sub r63,r22,r20 // r63 == 64 % 64
+        mmulfx.w r5,r4,r4       /* r4 = r5 * r4 (fixed point) */
+        pta large_divisor,tr0   /* tr0 = target of the bgt/u below */
+        addi r20,32,r9          /* r9 = 32 - r22 */
+        msub.w r1,r4,r1         /* r1 = r1 - r4 per 16-bit word */
+        madd.w r1,r1,r1         /* r1 = r1 + r1 per 16-bit word */
+        mmulfx.w r1,r1,r4       /* r4 = r1 * r1 for the next refinement */
+        shlri r6,32,r7          /* r7 = upper 32 bits of the normalized divisor */
+        bgt/u r9,r63,tr0 // large_divisor (taken when r9 = 32 - r22 is greater than zero)
+        mmulfx.w r5,r4,r4       /* r4 = r5 * r4 (fixed point) */
+        shlri r2,32+14,r19      /* r19 = r2 >> 46: leading bits of r2 for the first divide step */
+        addi r22,-31,r0         /* r0 = r22 - 31: shift count for the two shlld ...,r0 below */
+        msub.w r1,r4,r1         /* r1 = r1 - r4 per 16-bit word */
+        mulu.l r1,r7,r4         /* r4 = r1 * r7 (unsigned, low 32 bits of each) */
+        addi r1,-3,r5           /* r5 = r1 - 3 (presumably biased low so the estimate cannot overshoot) */
+        mulu.l r5,r19,r5        /* r5 = (r1 - 3) * r19: first partial quotient estimate */
+        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+                         the case may be, %0000000000000000 000.11111111111, still */
+        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+        mulu.l r5,r3,r8         /* r8 = partial quotient * divisor (low 32 bits of each) */
+        mshalds.l r1,r21,r1     /* shift r1 left by the .l count held in r21 (17) */
+        shari r4,26,r4          /* arithmetic shift: scale the signed correction term */
+        shlld r8,r0,r8          /* shift the product by r0 = r22 - 31 before subtracting it */
+        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+        sub r2,r8,r2            /* r2 = rest after the first divide step */
+        /* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
+        shlri r2,22,r21         /* r21 = rest >> 22 */
+        mulu.l r21,r1,r21       /* r21 = (rest >> 22) * reciprocal */
+        shlld r5,r0,r8          /* r8 = first partial quotient << r0; the quotient accumulates in r8 */
+        addi r20,30-22,r0       /* r0 = 8 - r22 */
+        shlrd r21,r0,r21        /* r21 = second partial quotient */
+        mulu.l r21,r3,r5        /* r5 = r21 * divisor (low 32 bits of each) */
+        add r8,r21,r8           /* add the second partial quotient to r8 */
+        mcmpgt.l r21,r63,r21 // See Note 1
+        addi r20,30,r0          /* r0 = 30 - r22: shift count for the third step */
+        mshfhi.l r63,r21,r21    /* shuffle the compare result with zero to form the Note 1 mask */
+        sub r2,r5,r2            /* rest -= partial product */
+        andc r2,r21,r2          /* r2 &= ~r21: clear the rest bits selected by the mask (Note 1) */
+        /* small divisor: need a third divide step */
+        mulu.l r2,r1,r7         /* r7 = rest * reciprocal (low 32 bits of each) */
+        ptabs r18,tr0           /* tr0 = address held in r18 (presumably the return address) */
+        addi r2,1,r2            /* r2 = rest + 1, so the signed > test below acts as >= */
+        shlrd r7,r0,r7          /* r7 = third partial quotient */
+        mulu.l r7,r3,r5         /* r5 = r7 * divisor (low 32 bits of each) */
+        add r8,r7,r8            /* add the third partial quotient to r8 */
+        sub r2,r3,r2            /* r2 = rest + 1 - divisor */
+        cmpgt r2,r5,r5          /* r5 = 1 if r2 > r5 (signed), i.e. one more divisor fits; else 0 */
+        add r8,r5,r2            /* r2 = accumulated quotient + final adjustment */
+        /* could test r3 here to check for divide by zero.  */
+        blink tr0,r63           /* branch to tr0; the link value is discarded into r63 */
+large_divisor:  /* reached when r9 = 32 - r22 > 0 (presumably divisor >= 2^32) */
+        mmulfx.w r5,r4,r4       /* r4 = r5 * r4 (fixed point) */
+        shlrd r2,r9,r25         /* r25 = r2 >> r9 */
+        shlri r25,32,r8         /* r8 = upper 32 bits of r25 */
+        msub.w r1,r4,r1         /* r1 = r1 - r4 per 16-bit word */
+        mulu.l r1,r7,r4         /* r4 = r1 * r7 (unsigned, low 32 bits of each) */
+        addi r1,-3,r5           /* r5 = r1 - 3 (presumably biased low so the estimate cannot overshoot) */
+        mulu.l r5,r8,r5         /* r5 = (r1 - 3) * r8: first partial quotient estimate */
+        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+                         the case may be, %0000000000000000 000.11111111111, still */
+        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+        shlri r5,14-1,r8        /* r8 = r5 >> 13: first partial quotient; the quotient accumulates in r8 */
+        mulu.l r8,r7,r5         /* r5 = r8 * r7 (upper half of the normalized divisor) */
+        mshalds.l r1,r21,r1     /* shift r1 left by the .l count held in r21 (17) */
+        shari r4,26,r4          /* arithmetic shift: scale the signed correction term */
+        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+        sub r25,r5,r25          /* r25 = rest after the first divide step */
+        /* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
+        shlri r25,22,r21        /* r21 = rest >> 22 */
+        mulu.l r21,r1,r21       /* r21 = (rest >> 22) * reciprocal */
+        pta no_lo_adj,tr0       /* tr0 = target of the bgtu/u below */
+        addi r22,32,r0          /* r0 = r22 + 32 */
+        shlri r21,40,r21        /* r21 = second partial quotient */
+        mulu.l r21,r7,r5        /* r5 = r21 * r7 (low 32 bits of each) */
+        add r8,r21,r8           /* add the second partial quotient to r8 */
+        shlld r2,r0,r2          /* r2 <<= r22 + 32: moves the low bits of r2 not yet used to the top */
+        sub r25,r5,r25          /* rest -= partial product */
+        bgtu/u r7,r25,tr0 // no_lo_adj (skip the adjustment when r7 > r25, unsigned)
+        addi r8,1,r8            /* r25 >= r7 here: add one to the quotient in r8 ... */
+        sub r25,r7,r25          /* ... and take r7 off the rest */
+no_lo_adj:
+        mextr4 r2,r25,r2        /* combine parts of r2 and r25 into one 64-bit value (4-byte extract across the pair) */
+        /* large_divisor: only needs a few adjustments.  */
+        mulu.l r8,r6,r5         /* r5 = r8 * r6 (quotient times low 32 bits of the normalized divisor) */
+        ptabs r18,tr0           /* tr0 = address held in r18 (presumably the return address) */
+        /* bubble */
+        cmpgtu r5,r2,r5         /* r5 = 1 if r5 > r2 (unsigned), else 0 */
+        sub r8,r5,r2            /* r2 = quotient minus that adjustment */
+        blink tr0,r63           /* branch to tr0; the link value is discarded into r63 */
+        
+/* Note 1: Shifting the result of the second divide stage so that it always
+   fits into 32 bits, while still reducing the rest (the running remainder)
+   sufficiently, would take a lot of instructions to get the shifts just
+   right.  Using the full 64 bit shift result to multiply with the divisor
+   would require four extra instructions for the upper 32 bits
+   (shift / mulu / shift / sub).  Fortunately, if the upper 32 bits of the
+   shift result are nonzero, we know that the rest after taking this partial
+   result into account will fit into 32 bits.  So we just clear the upper 32
+   bits of the rest if the upper 32 bits of the partial result are nonzero.  */

diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S new file mode 100644 index 000000000000..6895c0225b85 --- /dev/null +++ b/arch/sh/lib64/udivdi3.S
@@ -0,0 +1,120 @@
	1	.section .text..SHmedia32,"ax"
	2	.align 2
	3	.global __udivdi3
	4	__udivdi3:  /* Unsigned 64-bit divide: r3 = divisor, quotient ends up in r2 (dividend presumably enters in r2 - confirm against the ABI). */
	5	shlri r3,1,r4  /* r4 = r3 >> 1 (logical), so its top bit is clear for nsb */
	6	nsb r4,r22  /* r22 = sign-bit count of r4; used next as the left shift applied to r3 */
	7	shlld r3,r22,r6  /* r6 = r3 << r22: divisor shifted up (normalized) */
	8	shlri r6,49,r5  /* r5 = top 15 bits of r6 */
	9	movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
	10	sub r21,r5,r1  /* r1 = constant - r5: first reciprocal estimate, refined below */
	11	mmulfx.w r1,r1,r4  /* r4 = r1 * r1 (fixed-point multiply on 16-bit words) */
	12	mshflo.w r1,r63,r1  /* interleave the low words of r1 with zero words from r63 */
	13	sub r63,r22,r20 // r63 == 64 % 64
	14	mmulfx.w r5,r4,r4  /* r4 = r5 * r4 (fixed point) */
	15	pta large_divisor,tr0  /* tr0 = target of the bgt/u below */
	16	addi r20,32,r9  /* r9 = 32 - r22 */
	17	msub.w r1,r4,r1  /* r1 = r1 - r4 per 16-bit word */
	18	madd.w r1,r1,r1  /* r1 = r1 + r1 per 16-bit word */
	19	mmulfx.w r1,r1,r4  /* r4 = r1 * r1 for the next refinement */
	20	shlri r6,32,r7  /* r7 = upper 32 bits of the normalized divisor */
	21	bgt/u r9,r63,tr0 // large_divisor (taken when r9 = 32 - r22 is greater than zero)
	22	mmulfx.w r5,r4,r4  /* r4 = r5 * r4 (fixed point) */
	23	shlri r2,32+14,r19  /* r19 = r2 >> 46: leading bits of r2 for the first divide step */
	24	addi r22,-31,r0  /* r0 = r22 - 31: shift count for the two shlld ...,r0 below */
	25	msub.w r1,r4,r1  /* r1 = r1 - r4 per 16-bit word */
	26
	27	mulu.l r1,r7,r4  /* r4 = r1 * r7 (unsigned, low 32 bits of each) */
	28	addi r1,-3,r5  /* r5 = r1 - 3 (presumably biased low so the estimate cannot overshoot) */
	29	mulu.l r5,r19,r5  /* r5 = (r1 - 3) * r19: first partial quotient estimate */
	30	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	31	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
	32	the case may be, %0000000000000000 000.11111111111, still */
	33	muls.l r1,r4,r4 /* leaving at least one sign bit. */
	34	mulu.l r5,r3,r8  /* r8 = partial quotient * divisor (low 32 bits of each) */
	35	mshalds.l r1,r21,r1  /* shift r1 left by the .l count held in r21 (17) */
	36	shari r4,26,r4  /* arithmetic shift: scale the signed correction term */
	37	shlld r8,r0,r8  /* shift the product by r0 = r22 - 31 before subtracting it */
	38	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	39	sub r2,r8,r2  /* r2 = rest after the first divide step */
	40	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
	41
	42	shlri r2,22,r21  /* r21 = rest >> 22 */
	43	mulu.l r21,r1,r21  /* r21 = (rest >> 22) * reciprocal */
	44	shlld r5,r0,r8  /* r8 = first partial quotient << r0; the quotient accumulates in r8 */
	45	addi r20,30-22,r0  /* r0 = 8 - r22 */
	46	shlrd r21,r0,r21  /* r21 = second partial quotient */
	47	mulu.l r21,r3,r5  /* r5 = r21 * divisor (low 32 bits of each) */
	48	add r8,r21,r8  /* add the second partial quotient to r8 */
	49	mcmpgt.l r21,r63,r21 // See Note 1
	50	addi r20,30,r0  /* r0 = 30 - r22: shift count for the third step */
	51	mshfhi.l r63,r21,r21  /* shuffle the compare result with zero to form the Note 1 mask */
	52	sub r2,r5,r2  /* rest -= partial product */
	53	andc r2,r21,r2  /* r2 &= ~r21: clear the rest bits selected by the mask (Note 1) */
	54
	55	/* small divisor: need a third divide step */
	56	mulu.l r2,r1,r7  /* r7 = rest * reciprocal (low 32 bits of each) */
	57	ptabs r18,tr0  /* tr0 = address held in r18 (presumably the return address) */
	58	addi r2,1,r2  /* r2 = rest + 1, so the signed > test below acts as >= */
	59	shlrd r7,r0,r7  /* r7 = third partial quotient */
	60	mulu.l r7,r3,r5  /* r5 = r7 * divisor (low 32 bits of each) */
	61	add r8,r7,r8  /* add the third partial quotient to r8 */
	62	sub r2,r3,r2  /* r2 = rest + 1 - divisor */
	63	cmpgt r2,r5,r5  /* r5 = 1 if r2 > r5 (signed), i.e. one more divisor fits; else 0 */
	64	add r8,r5,r2  /* r2 = accumulated quotient + final adjustment */
	65	/* could test r3 here to check for divide by zero. */
	66	blink tr0,r63  /* branch to tr0; the link value is discarded into r63 */
	67
	68	large_divisor:  /* reached when r9 = 32 - r22 > 0 (presumably divisor >= 2^32) */
	69	mmulfx.w r5,r4,r4  /* r4 = r5 * r4 (fixed point) */
	70	shlrd r2,r9,r25  /* r25 = r2 >> r9 */
	71	shlri r25,32,r8  /* r8 = upper 32 bits of r25 */
	72	msub.w r1,r4,r1  /* r1 = r1 - r4 per 16-bit word */
	73
	74	mulu.l r1,r7,r4  /* r4 = r1 * r7 (unsigned, low 32 bits of each) */
	75	addi r1,-3,r5  /* r5 = r1 - 3 (presumably biased low so the estimate cannot overshoot) */
	76	mulu.l r5,r8,r5  /* r5 = (r1 - 3) * r8: first partial quotient estimate */
	77	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	78	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
	79	the case may be, %0000000000000000 000.11111111111, still */
	80	muls.l r1,r4,r4 /* leaving at least one sign bit. */
	81	shlri r5,14-1,r8  /* r8 = r5 >> 13: first partial quotient; the quotient accumulates in r8 */
	82	mulu.l r8,r7,r5  /* r5 = r8 * r7 (upper half of the normalized divisor) */
	83	mshalds.l r1,r21,r1  /* shift r1 left by the .l count held in r21 (17) */
	84	shari r4,26,r4  /* arithmetic shift: scale the signed correction term */
	85	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	86	sub r25,r5,r25  /* r25 = rest after the first divide step */
	87	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
	88
	89	shlri r25,22,r21  /* r21 = rest >> 22 */
	90	mulu.l r21,r1,r21  /* r21 = (rest >> 22) * reciprocal */
	91	pta no_lo_adj,tr0  /* tr0 = target of the bgtu/u below */
	92	addi r22,32,r0  /* r0 = r22 + 32 */
	93	shlri r21,40,r21  /* r21 = second partial quotient */
	94	mulu.l r21,r7,r5  /* r5 = r21 * r7 (low 32 bits of each) */
	95	add r8,r21,r8  /* add the second partial quotient to r8 */
	96	shlld r2,r0,r2  /* r2 <<= r22 + 32: moves the low bits of r2 not yet used to the top */
	97	sub r25,r5,r25  /* rest -= partial product */
	98	bgtu/u r7,r25,tr0 // no_lo_adj (skip the adjustment when r7 > r25, unsigned)
	99	addi r8,1,r8  /* r25 >= r7 here: add one to the quotient in r8 ... */
	100	sub r25,r7,r25  /* ... and take r7 off the rest */
	101	no_lo_adj:
	102	mextr4 r2,r25,r2  /* combine parts of r2 and r25 into one 64-bit value (4-byte extract across the pair) */
	103
	104	/* large_divisor: only needs a few adjustments. */
	105	mulu.l r8,r6,r5  /* r5 = r8 * r6 (quotient times low 32 bits of the normalized divisor) */
	106	ptabs r18,tr0  /* tr0 = address held in r18 (presumably the return address) */
	107	/* bubble */
	108	cmpgtu r5,r2,r5  /* r5 = 1 if r5 > r2 (unsigned), else 0 */
	109	sub r8,r5,r2  /* r2 = quotient minus that adjustment */
	110	blink tr0,r63  /* branch to tr0; the link value is discarded into r63 */
	111
	112	/* Note 1: Shifting the result of the second divide stage so that it always
	113	fits into 32 bits, while still reducing the rest (the running remainder)
	114	sufficiently, would take a lot of instructions to get the shifts just
	115	right. Using the full 64 bit shift result to multiply with the divisor
	116	would require four extra instructions for the upper 32 bits
	117	(shift / mulu / shift / sub). Fortunately, if the upper 32 bits of the
	118	shift result are nonzero, we know that the rest after taking this partial
	119	result into account will fit into 32 bits. So we just clear the upper 32
	120	bits of the rest if the upper 32 bits of the partial result are nonzero. */