From fcc9d2e5a6c89d22b8b773a64fb4ad21ac318446 Mon Sep 17 00:00:00 2001 From: Jonathan Herman Date: Tue, 22 Jan 2013 10:38:37 -0500 Subject: Added missing tegra files. --- arch/sparc/lib/atomic_32.S | 99 +++++++++ arch/sparc/lib/mul.S | 137 ++++++++++++ arch/sparc/lib/rem.S | 384 ++++++++++++++++++++++++++++++++++ arch/sparc/lib/sdiv.S | 381 +++++++++++++++++++++++++++++++++ arch/sparc/lib/strlen_user_32.S | 109 ++++++++++ arch/sparc/lib/strlen_user_64.S | 95 +++++++++ arch/sparc/lib/strncpy_from_user_32.S | 47 +++++ arch/sparc/lib/strncpy_from_user_64.S | 135 ++++++++++++ arch/sparc/lib/udiv.S | 357 +++++++++++++++++++++++++++++++ arch/sparc/lib/umul.S | 171 +++++++++++++++ arch/sparc/lib/urem.S | 357 +++++++++++++++++++++++++++++++ 11 files changed, 2272 insertions(+) create mode 100644 arch/sparc/lib/atomic_32.S create mode 100644 arch/sparc/lib/mul.S create mode 100644 arch/sparc/lib/rem.S create mode 100644 arch/sparc/lib/sdiv.S create mode 100644 arch/sparc/lib/strlen_user_32.S create mode 100644 arch/sparc/lib/strlen_user_64.S create mode 100644 arch/sparc/lib/strncpy_from_user_32.S create mode 100644 arch/sparc/lib/strncpy_from_user_64.S create mode 100644 arch/sparc/lib/udiv.S create mode 100644 arch/sparc/lib/umul.S create mode 100644 arch/sparc/lib/urem.S (limited to 'arch/sparc/lib') diff --git a/arch/sparc/lib/atomic_32.S b/arch/sparc/lib/atomic_32.S new file mode 100644 index 00000000000..178cbb8ae1b --- /dev/null +++ b/arch/sparc/lib/atomic_32.S @@ -0,0 +1,99 @@ +/* atomic.S: Move this stuff here for better ICACHE hit rates. + * + * Copyright (C) 1996 David S. Miller (davem@caipfs.rutgers.edu) + */ + +#include +#include + + .text + .align 4 + + .globl __atomic_begin +__atomic_begin: + +#ifndef CONFIG_SMP + .globl ___xchg32_sun4c +___xchg32_sun4c: + rd %psr, %g3 + andcc %g3, PSR_PIL, %g0 + bne 1f + nop + wr %g3, PSR_PIL, %psr + nop; nop; nop +1: + andcc %g3, PSR_PIL, %g0 + ld [%g1], %g7 + bne 1f + st %g2, [%g1] + wr %g3, 0x0, %psr + nop; nop; nop +1: + mov %g7, %g2 + jmpl %o7 + 8, %g0 + mov %g4, %o7 + + .globl ___xchg32_sun4md +___xchg32_sun4md: + swap [%g1], %g2 + jmpl %o7 + 8, %g0 + mov %g4, %o7 +#endif + + /* Read asm-sparc/atomic.h carefully to understand how this works for SMP. + * Really, some things here for SMP are overly clever, go read the header. + */ + .globl ___atomic24_add +___atomic24_add: + rd %psr, %g3 ! Keep the code small, old way was stupid + nop; nop; nop; ! Let the bits set + or %g3, PSR_PIL, %g7 ! Disable interrupts + wr %g7, 0x0, %psr ! Set %psr + nop; nop; nop; ! Let the bits set +#ifdef CONFIG_SMP +1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. + orcc %g7, 0x0, %g0 ! Did we get it? + bne 1b ! Nope... + ld [%g1], %g7 ! Load locked atomic24_t + sra %g7, 8, %g7 ! Get signed 24-bit integer + add %g7, %g2, %g2 ! Add in argument + sll %g2, 8, %g7 ! Transpose back to atomic24_t + st %g7, [%g1] ! Clever: This releases the lock as well. +#else + ld [%g1], %g7 ! Load locked atomic24_t + add %g7, %g2, %g2 ! Add in argument + st %g2, [%g1] ! Store it back +#endif + wr %g3, 0x0, %psr ! Restore original PSR_PIL + nop; nop; nop; ! Let the bits set + jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h + mov %g4, %o7 ! Restore %o7 + + .globl ___atomic24_sub +___atomic24_sub: + rd %psr, %g3 ! Keep the code small, old way was stupid + nop; nop; nop; ! Let the bits set + or %g3, PSR_PIL, %g7 ! Disable interrupts + wr %g7, 0x0, %psr ! Set %psr + nop; nop; nop; ! Let the bits set +#ifdef CONFIG_SMP +1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. + orcc %g7, 0x0, %g0 ! Did we get it? + bne 1b ! Nope... + ld [%g1], %g7 ! Load locked atomic24_t + sra %g7, 8, %g7 ! Get signed 24-bit integer + sub %g7, %g2, %g2 ! Subtract argument + sll %g2, 8, %g7 ! Transpose back to atomic24_t + st %g7, [%g1] ! Clever: This releases the lock as well +#else + ld [%g1], %g7 ! Load locked atomic24_t + sub %g7, %g2, %g2 ! Subtract argument + st %g2, [%g1] ! Store it back +#endif + wr %g3, 0x0, %psr ! Restore original PSR_PIL + nop; nop; nop; ! Let the bits set + jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h + mov %g4, %o7 ! Restore %o7 + + .globl __atomic_end +__atomic_end: diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S new file mode 100644 index 00000000000..c45470d0b0c --- /dev/null +++ b/arch/sparc/lib/mul.S @@ -0,0 +1,137 @@ +/* + * mul.S: This routine was taken from glibc-1.09 and is covered + * by the GNU Library General Public License Version 2. + */ + +/* + * Signed multiply, from Appendix E of the Sparc Version 8 + * Architecture Manual. + */ + +/* + * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of + * the 64-bit product). + * + * This code optimizes short (less than 13-bit) multiplies. + */ + + .globl .mul + .globl _Mul +.mul: +_Mul: /* needed for export */ + mov %o0, %y ! multiplier -> Y + andncc %o0, 0xfff, %g0 ! test bits 12..31 + be Lmul_shortway ! if zero, can do it the short way + andcc %g0, %g0, %o4 ! zero the partial product and clear N and V + + /* + * Long multiply. 32 steps, followed by a final shift step. + */ + mulscc %o4, %o1, %o4 ! 1 + mulscc %o4, %o1, %o4 ! 2 + mulscc %o4, %o1, %o4 ! 3 + mulscc %o4, %o1, %o4 ! 4 + mulscc %o4, %o1, %o4 ! 5 + mulscc %o4, %o1, %o4 ! 6 + mulscc %o4, %o1, %o4 ! 7 + mulscc %o4, %o1, %o4 ! 8 + mulscc %o4, %o1, %o4 ! 9 + mulscc %o4, %o1, %o4 ! 10 + mulscc %o4, %o1, %o4 ! 11 + mulscc %o4, %o1, %o4 ! 12 + mulscc %o4, %o1, %o4 ! 13 + mulscc %o4, %o1, %o4 ! 14 + mulscc %o4, %o1, %o4 ! 15 + mulscc %o4, %o1, %o4 ! 16 + mulscc %o4, %o1, %o4 ! 17 + mulscc %o4, %o1, %o4 ! 18 + mulscc %o4, %o1, %o4 ! 19 + mulscc %o4, %o1, %o4 ! 20 + mulscc %o4, %o1, %o4 ! 21 + mulscc %o4, %o1, %o4 ! 22 + mulscc %o4, %o1, %o4 ! 23 + mulscc %o4, %o1, %o4 ! 24 + mulscc %o4, %o1, %o4 ! 25 + mulscc %o4, %o1, %o4 ! 26 + mulscc %o4, %o1, %o4 ! 27 + mulscc %o4, %o1, %o4 ! 28 + mulscc %o4, %o1, %o4 ! 29 + mulscc %o4, %o1, %o4 ! 30 + mulscc %o4, %o1, %o4 ! 31 + mulscc %o4, %o1, %o4 ! 32 + mulscc %o4, %g0, %o4 ! final shift + + ! If %o0 was negative, the result is + ! (%o0 * %o1) + (%o1 << 32)) + ! We fix that here. + +#if 0 + tst %o0 + bge 1f + rd %y, %o0 + + ! %o0 was indeed negative; fix upper 32 bits of result by subtracting + ! %o1 (i.e., return %o4 - %o1 in %o1). + retl + sub %o4, %o1, %o1 + +1: + retl + mov %o4, %o1 +#else + /* Faster code adapted from tege@sics.se's code for umul.S. */ + sra %o0, 31, %o2 ! make mask from sign bit + and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0 + rd %y, %o0 ! get lower half of product + retl + sub %o4, %o2, %o1 ! subtract compensation + ! and put upper half in place +#endif + +Lmul_shortway: + /* + * Short multiply. 12 steps, followed by a final shift step. + * The resulting bits are off by 12 and (32-12) = 20 bit positions, + * but there is no problem with %o0 being negative (unlike above). + */ + mulscc %o4, %o1, %o4 ! 1 + mulscc %o4, %o1, %o4 ! 2 + mulscc %o4, %o1, %o4 ! 3 + mulscc %o4, %o1, %o4 ! 4 + mulscc %o4, %o1, %o4 ! 5 + mulscc %o4, %o1, %o4 ! 6 + mulscc %o4, %o1, %o4 ! 7 + mulscc %o4, %o1, %o4 ! 8 + mulscc %o4, %o1, %o4 ! 9 + mulscc %o4, %o1, %o4 ! 10 + mulscc %o4, %o1, %o4 ! 11 + mulscc %o4, %o1, %o4 ! 12 + mulscc %o4, %g0, %o4 ! final shift + + /* + * %o4 has 20 of the bits that should be in the low part of the + * result; %y has the bottom 12 (as %y's top 12). That is: + * + * %o4 %y + * +----------------+----------------+ + * | -12- | -20- | -12- | -20- | + * +------(---------+------)---------+ + * --hi-- ----low-part---- + * + * The upper 12 bits of %o4 should be sign-extended to form the + * high part of the product (i.e., highpart = %o4 >> 20). + */ + + rd %y, %o5 + sll %o4, 12, %o0 ! shift middle bits left 12 + srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left + or %o5, %o0, %o0 ! construct low part of result + retl + sra %o4, 20, %o1 ! ... and extract high part of result + + .globl .mul_patch +.mul_patch: + smul %o0, %o1, %o0 + retl + rd %y, %o1 + nop diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S new file mode 100644 index 00000000000..42fb8625281 --- /dev/null +++ b/arch/sparc/lib/rem.S @@ -0,0 +1,384 @@ +/* + * rem.S: This routine was taken from glibc-1.09 and is covered + * by the GNU Library General Public License Version 2. + */ + + +/* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .rem name of function to generate + * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 + * true true=true => signed; true=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + + .globl .rem + .globl _Rem +.rem: +_Rem: /* needed for export */ + ! compute sign of result; if neither is negative, no problem + orcc %o1, %o0, %g0 ! either negative? + bge 2f ! no, go do the divide + mov %o0, %g2 ! compute sign in any case + + tst %o1 + bge 1f + tst %o0 + ! %o1 is definitely negative; %o0 might also be negative + bge 2f ! if %o0 not negative... + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg +1: ! %o0 is negative, %o1 is nonnegative + sub %g0, %o0, %o0 ! make %o0 nonnegative +2: + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu Lgot_result ! (and algorithm fails otherwise) + clr %o2 + + sethi %hi(1 << (32 - 4 - 1)), %g1 + + cmp %o3, %g1 + blu Lnot_really_big + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g7 + + sll %o5, 4, %o5 + + b 1b + add %o4, 1, %o4 + + ! Now compute %g7. + 2: + addcc %o5, %o5, %o5 + + bcc Lnot_too_big + add %g7, 1, %g7 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + + b Ldo_single_div + sub %g7, 1, %g7 + + Lnot_too_big: + 3: + cmp %o5, %o3 + blu 2b + nop + + be Ldo_single_div + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g7 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + Ldo_single_div: + subcc %g7, 1, %g7 + bl Lend_regular_divide + nop + + sub %o3, %o5, %o3 + mov 1, %o2 + + b Lend_single_divloop + nop + Lsingle_divloop: + sll %o2, 1, %o2 + + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + Lend_single_divloop: + subcc %g7, 1, %g7 + bge Lsingle_divloop + tst %o3 + + b,a Lend_regular_divide + +Lnot_really_big: +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + be Lgot_result + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +Ldivloop: + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl L.1.16 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl L.2.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl L.3.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl L.4.23 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + + b 9f + add %o2, (7*2+1), %o2 + +L.4.23: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + +L.3.19: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl L.4.21 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +L.4.21: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + +L.2.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl L.3.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl L.4.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +L.4.19: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + +L.3.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl L.4.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +L.4.17: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + +L.1.16: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl L.2.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl L.3.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl L.4.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +L.4.15: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + +L.3.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl L.4.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +L.4.13: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + +L.2.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl L.3.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl L.4.11 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +L.4.11: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + + +L.3.13: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl L.4.9 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +L.4.9: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + 9: +Lend_regular_divide: + subcc %o4, 1, %o4 + bge Ldivloop + tst %o3 + + bl,a Lgot_result + ! non-restoring fixup here (one instruction only!) + add %o3, %o1, %o3 + +Lgot_result: + ! check to see if answer should be < 0 + tst %g2 + bl,a 1f + sub %g0, %o3, %o3 +1: + retl + mov %o3, %o0 + + .globl .rem_patch +.rem_patch: + sra %o0, 0x1f, %o4 + wr %o4, 0x0, %y + nop + nop + nop + sdivcc %o0, %o1, %o2 + bvs,a 1f + xnor %o2, %g0, %o2 +1: smul %o2, %o1, %o2 + retl + sub %o0, %o2, %o0 + nop diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S new file mode 100644 index 00000000000..f0a0d4e4db7 --- /dev/null +++ b/arch/sparc/lib/sdiv.S @@ -0,0 +1,381 @@ +/* + * sdiv.S: This routine was taken from glibc-1.09 and is covered + * by the GNU Library General Public License Version 2. + */ + + +/* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .div name of function to generate + * div div=div => %o0 / %o1; div=rem => %o0 % %o1 + * true true=true => signed; true=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + + .globl .div + .globl _Div +.div: +_Div: /* needed for export */ + ! compute sign of result; if neither is negative, no problem + orcc %o1, %o0, %g0 ! either negative? + bge 2f ! no, go do the divide + xor %o1, %o0, %g2 ! compute sign in any case + + tst %o1 + bge 1f + tst %o0 + ! %o1 is definitely negative; %o0 might also be negative + bge 2f ! if %o0 not negative... + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg +1: ! %o0 is negative, %o1 is nonnegative + sub %g0, %o0, %o0 ! make %o0 nonnegative +2: + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu Lgot_result ! (and algorithm fails otherwise) + clr %o2 + + sethi %hi(1 << (32 - 4 - 1)), %g1 + + cmp %o3, %g1 + blu Lnot_really_big + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g7 + + sll %o5, 4, %o5 + + b 1b + add %o4, 1, %o4 + + ! Now compute %g7. + 2: + addcc %o5, %o5, %o5 + bcc Lnot_too_big + add %g7, 1, %g7 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + + b Ldo_single_div + sub %g7, 1, %g7 + + Lnot_too_big: + 3: + cmp %o5, %o3 + blu 2b + nop + + be Ldo_single_div + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g7 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + Ldo_single_div: + subcc %g7, 1, %g7 + bl Lend_regular_divide + nop + + sub %o3, %o5, %o3 + mov 1, %o2 + + b Lend_single_divloop + nop + Lsingle_divloop: + sll %o2, 1, %o2 + + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + Lend_single_divloop: + subcc %g7, 1, %g7 + bge Lsingle_divloop + tst %o3 + + b,a Lend_regular_divide + +Lnot_really_big: +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + + be Lgot_result + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +Ldivloop: + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl L.1.16 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl L.2.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl L.3.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl L.4.23 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +L.4.23: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + +L.3.19: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl L.4.21 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +L.4.21: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + +L.2.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl L.3.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl L.4.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +L.4.19: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + + +L.3.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl L.4.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +L.4.17: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + +L.1.16: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl L.2.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl L.3.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl L.4.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +L.4.15: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + +L.3.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl L.4.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +L.4.13: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + +L.2.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl L.3.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl L.4.11 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +L.4.11: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + +L.3.13: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl L.4.9 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +L.4.9: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + 9: +Lend_regular_divide: + subcc %o4, 1, %o4 + bge Ldivloop + tst %o3 + + bl,a Lgot_result + ! non-restoring fixup here (one instruction only!) + sub %o2, 1, %o2 + +Lgot_result: + ! check to see if answer should be < 0 + tst %g2 + bl,a 1f + sub %g0, %o2, %o2 +1: + retl + mov %o2, %o0 + + .globl .div_patch +.div_patch: + sra %o0, 0x1f, %o2 + wr %o2, 0x0, %y + nop + nop + nop + sdivcc %o0, %o1, %o0 + bvs,a 1f + xnor %o0, %g0, %o0 +1: retl + nop diff --git a/arch/sparc/lib/strlen_user_32.S b/arch/sparc/lib/strlen_user_32.S new file mode 100644 index 00000000000..8c8a371df3c --- /dev/null +++ b/arch/sparc/lib/strlen_user_32.S @@ -0,0 +1,109 @@ +/* strlen_user.S: Sparc optimized strlen_user code + * + * Return length of string in userspace including terminating 0 + * or 0 for error + * + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + +10: + ldub [%o0], %o5 + cmp %o5, 0 + be 1f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be 4f + or %o4, %lo(HI_MAGIC), %o3 +11: + ldub [%o0], %o5 + cmp %o5, 0 + be 2f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be 5f + sethi %hi(LO_MAGIC), %o4 +12: + ldub [%o0], %o5 + cmp %o5, 0 + be 3f + add %o0, 1, %o0 + b 13f + or %o4, %lo(LO_MAGIC), %o2 +1: + retl + mov 1, %o0 +2: + retl + mov 2, %o0 +3: + retl + mov 3, %o0 + + .align 4 + .global __strlen_user, __strnlen_user +__strlen_user: + sethi %hi(32768), %o1 +__strnlen_user: + mov %o1, %g1 + mov %o0, %o1 + andcc %o0, 3, %g0 + bne 10b + sethi %hi(HI_MAGIC), %o4 + or %o4, %lo(HI_MAGIC), %o3 +4: + sethi %hi(LO_MAGIC), %o4 +5: + or %o4, %lo(LO_MAGIC), %o2 +13: + ld [%o0], %o5 +2: + sub %o5, %o2, %o4 + andcc %o4, %o3, %g0 + bne 82f + add %o0, 4, %o0 + sub %o0, %o1, %g2 +81: cmp %g2, %g1 + blu 13b + mov %o0, %o4 + ba,a 1f + + /* Check every byte. */ +82: srl %o5, 24, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o0, -3, %o4 + srl %o5, 16, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o4, 1, %o4 + srl %o5, 8, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o4, 1, %o4 + andcc %o5, 0xff, %g0 + bne 81b + sub %o0, %o1, %g2 + + add %o4, 1, %o4 +1: + retl + sub %o4, %o1, %o0 + + .section .fixup,#alloc,#execinstr + .align 4 +9: + retl + clr %o0 + + .section __ex_table,#alloc + .align 4 + + .word 10b, 9b + .word 11b, 9b + .word 12b, 9b + .word 13b, 9b diff --git a/arch/sparc/lib/strlen_user_64.S b/arch/sparc/lib/strlen_user_64.S new file mode 100644 index 00000000000..114ed111e25 --- /dev/null +++ b/arch/sparc/lib/strlen_user_64.S @@ -0,0 +1,95 @@ +/* strlen_user.S: Sparc64 optimized strlen_user code + * + * Return length of string in userspace including terminating 0 + * or 0 for error + * + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996,1999 David S. Miller (davem@redhat.com) + * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#include + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + + .align 4 + .global __strlen_user, __strnlen_user +__strlen_user: + sethi %hi(32768), %o1 +__strnlen_user: + mov %o1, %g1 + mov %o0, %o1 + andcc %o0, 3, %g0 + be,pt %icc, 9f + sethi %hi(HI_MAGIC), %o4 +10: lduba [%o0] %asi, %o5 + brz,pn %o5, 21f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be,pn %icc, 4f + or %o4, %lo(HI_MAGIC), %o3 +11: lduba [%o0] %asi, %o5 + brz,pn %o5, 22f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be,pt %icc, 13f + srl %o3, 7, %o2 +12: lduba [%o0] %asi, %o5 + brz,pn %o5, 23f + add %o0, 1, %o0 + ba,pt %icc, 2f +15: lda [%o0] %asi, %o5 +9: or %o4, %lo(HI_MAGIC), %o3 +4: srl %o3, 7, %o2 +13: lda [%o0] %asi, %o5 +2: sub %o5, %o2, %o4 + andcc %o4, %o3, %g0 + bne,pn %icc, 82f + add %o0, 4, %o0 + sub %o0, %o1, %g2 +81: cmp %g2, %g1 + blu,pt %icc, 13b + mov %o0, %o4 + ba,a,pt %xcc, 1f + + /* Check every byte. */ +82: srl %o5, 24, %g7 + andcc %g7, 0xff, %g0 + be,pn %icc, 1f + add %o0, -3, %o4 + srl %o5, 16, %g7 + andcc %g7, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + srl %o5, 8, %g7 + andcc %g7, 0xff, %g0 + be,pn %icc, 1f + add %o4, 1, %o4 + andcc %o5, 0xff, %g0 + bne,pt %icc, 81b + sub %o0, %o1, %g2 + add %o4, 1, %o4 +1: retl + sub %o4, %o1, %o0 +21: retl + mov 1, %o0 +22: retl + mov 2, %o0 +23: retl + mov 3, %o0 + + .section .fixup,#alloc,#execinstr + .align 4 +30: + retl + clr %o0 + + .section __ex_table,"a" + .align 4 + + .word 10b, 30b + .word 11b, 30b + .word 12b, 30b + .word 15b, 30b + .word 13b, 30b diff --git a/arch/sparc/lib/strncpy_from_user_32.S b/arch/sparc/lib/strncpy_from_user_32.S new file mode 100644 index 00000000000..d77198976a6 --- /dev/null +++ b/arch/sparc/lib/strncpy_from_user_32.S @@ -0,0 +1,47 @@ +/* strncpy_from_user.S: Sparc strncpy from userspace. + * + * Copyright(C) 1996 David S. Miller + */ + +#include +#include + + .text + .align 4 + + /* Must return: + * + * -EFAULT for an exception + * count if we hit the buffer limit + * bytes copied if we hit a null byte + */ + + .globl __strncpy_from_user +__strncpy_from_user: + /* %o0=dest, %o1=src, %o2=count */ + mov %o2, %o3 +1: + subcc %o2, 1, %o2 + bneg 2f + nop +10: + ldub [%o1], %o4 + add %o0, 1, %o0 + cmp %o4, 0 + add %o1, 1, %o1 + bne 1b + stb %o4, [%o0 - 1] +2: + add %o2, 1, %o0 + retl + sub %o3, %o0, %o0 + + .section .fixup,#alloc,#execinstr + .align 4 +4: + retl + mov -EFAULT, %o0 + + .section __ex_table,#alloc + .align 4 + .word 10b, 4b diff --git a/arch/sparc/lib/strncpy_from_user_64.S b/arch/sparc/lib/strncpy_from_user_64.S new file mode 100644 index 00000000000..511c8f136f9 --- /dev/null +++ b/arch/sparc/lib/strncpy_from_user_64.S @@ -0,0 +1,135 @@ +/* + * strncpy_from_user.S: Sparc64 strncpy from userspace. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + */ + +#include +#include + + .data + .align 8 +0: .xword 0x0101010101010101 + + .text + .align 32 + + /* Must return: + * + * -EFAULT for an exception + * count if we hit the buffer limit + * bytes copied if we hit a null byte + * (without the null byte) + * + * This implementation assumes: + * %o1 is 8 aligned => !(%o2 & 7) + * %o0 is 8 aligned (if not, it will be slooooow, but will work) + * + * This is optimized for the common case: + * in my stats, 90% of src are 8 aligned (even on sparc32) + * and average length is 18 or so. + */ + + .globl __strncpy_from_user + .type __strncpy_from_user,#function +__strncpy_from_user: + /* %o0=dest, %o1=src, %o2=count */ + andcc %o1, 7, %g0 ! IEU1 Group + bne,pn %icc, 30f ! CTI + add %o0, %o2, %g3 ! IEU0 +60: ldxa [%o1] %asi, %g1 ! Load Group + brlez,pn %o2, 10f ! CTI + mov %o0, %o3 ! IEU0 +50: sethi %hi(0b), %o4 ! IEU0 Group + ldx [%o4 + %lo(0b)], %o4 ! Load + sllx %o4, 7, %o5 ! IEU1 Group +1: sub %g1, %o4, %g2 ! IEU0 Group + stx %g1, [%o0] ! Store + add %o0, 8, %o0 ! IEU1 + andcc %g2, %o5, %g0 ! IEU1 Group + bne,pn %xcc, 5f ! CTI + add %o1, 8, %o1 ! IEU0 + cmp %o0, %g3 ! IEU1 Group + bl,a,pt %xcc, 1b ! CTI +61: ldxa [%o1] %asi, %g1 ! Load +10: retl ! CTI Group + mov %o2, %o0 ! IEU0 +5: srlx %g2, 32, %g7 ! IEU0 Group + sethi %hi(0xff00), %o4 ! IEU1 + andcc %g7, %o5, %g0 ! IEU1 Group + be,pn %icc, 2f ! CTI + or %o4, %lo(0xff00), %o4 ! IEU0 + srlx %g1, 48, %g7 ! IEU0 Group + andcc %g7, %o4, %g0 ! IEU1 Group + be,pn %icc, 50f ! CTI + andcc %g7, 0xff, %g0 ! IEU1 Group + be,pn %icc, 51f ! CTI + srlx %g1, 32, %g7 ! IEU0 + andcc %g7, %o4, %g0 ! IEU1 Group + be,pn %icc, 52f ! CTI + andcc %g7, 0xff, %g0 ! IEU1 Group + be,pn %icc, 53f ! CTI +2: andcc %g2, %o5, %g0 ! IEU1 Group + be,pn %icc, 2f ! CTI + srl %g1, 16, %g7 ! IEU0 + andcc %g7, %o4, %g0 ! IEU1 Group + be,pn %icc, 54f ! CTI + andcc %g7, 0xff, %g0 ! IEU1 Group + be,pn %icc, 55f ! CTI + andcc %g1, %o4, %g0 ! IEU1 Group + be,pn %icc, 56f ! CTI + andcc %g1, 0xff, %g0 ! IEU1 Group + be,a,pn %icc, 57f ! CTI + sub %o0, %o3, %o0 ! IEU0 +2: cmp %o0, %g3 ! IEU1 Group + bl,a,pt %xcc, 50b ! CTI +62: ldxa [%o1] %asi, %g1 ! Load + retl ! CTI Group + mov %o2, %o0 ! IEU0 +50: sub %o0, %o3, %o0 + retl + sub %o0, 8, %o0 +51: sub %o0, %o3, %o0 + retl + sub %o0, 7, %o0 +52: sub %o0, %o3, %o0 + retl + sub %o0, 6, %o0 +53: sub %o0, %o3, %o0 + retl + sub %o0, 5, %o0 +54: sub %o0, %o3, %o0 + retl + sub %o0, 4, %o0 +55: sub %o0, %o3, %o0 + retl + sub %o0, 3, %o0 +56: sub %o0, %o3, %o0 + retl + sub %o0, 2, %o0 +57: retl + sub %o0, 1, %o0 +30: brlez,pn %o2, 3f + sub %g0, %o2, %o3 + add %o0, %o2, %o0 +63: lduba [%o1] %asi, %o4 +1: add %o1, 1, %o1 + brz,pn %o4, 2f + stb %o4, [%o0 + %o3] + addcc %o3, 1, %o3 + bne,pt %xcc, 1b +64: lduba [%o1] %asi, %o4 +3: retl + mov %o2, %o0 +2: retl + add %o2, %o3, %o0 + .size __strncpy_from_user, .-__strncpy_from_user + + .section __ex_table,"a" + .align 4 + .word 60b, __retl_efault + .word 61b, __retl_efault + .word 62b, __retl_efault + .word 63b, __retl_efault + .word 64b, __retl_efault + .previous diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S new file mode 100644 index 00000000000..2101405bdfc --- /dev/null +++ b/arch/sparc/lib/udiv.S @@ -0,0 +1,357 @@ +/* + * udiv.S: This routine was taken from glibc-1.09 and is covered + * by the GNU Library General Public License Version 2. + */ + + +/* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .udiv name of function to generate + * div div=div => %o0 / %o1; div=rem => %o0 % %o1 + * false false=true => signed; false=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + + .globl .udiv + .globl _Udiv +.udiv: +_Udiv: /* needed for export */ + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu Lgot_result ! (and algorithm fails otherwise) + clr %o2 + + sethi %hi(1 << (32 - 4 - 1)), %g1 + + cmp %o3, %g1 + blu Lnot_really_big + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g7 + + sll %o5, 4, %o5 + + b 1b + add %o4, 1, %o4 + + ! Now compute %g7. + 2: + addcc %o5, %o5, %o5 + bcc Lnot_too_big + add %g7, 1, %g7 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + + b Ldo_single_div + sub %g7, 1, %g7 + + Lnot_too_big: + 3: + cmp %o5, %o3 + blu 2b + nop + + be Ldo_single_div + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g7 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + Ldo_single_div: + subcc %g7, 1, %g7 + bl Lend_regular_divide + nop + + sub %o3, %o5, %o3 + mov 1, %o2 + + b Lend_single_divloop + nop + Lsingle_divloop: + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + Lend_single_divloop: + subcc %g7, 1, %g7 + bge Lsingle_divloop + tst %o3 + + b,a Lend_regular_divide + +Lnot_really_big: +1: + sll %o5, 4, %o5 + + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + + be Lgot_result + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +Ldivloop: + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl L.1.16 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl L.2.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl L.3.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl L.4.23 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +L.4.23: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + +L.3.19: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl L.4.21 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +L.4.21: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + +L.2.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl L.3.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl L.4.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +L.4.19: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + +L.3.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl L.4.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +L.4.17: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + +L.1.16: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl L.2.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl L.3.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl L.4.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +L.4.15: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + +L.3.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl L.4.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +L.4.13: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + +L.2.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl L.3.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl L.4.11 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +L.4.11: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + +L.3.13: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl L.4.9 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +L.4.9: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + 9: +Lend_regular_divide: + subcc %o4, 1, %o4 + bge Ldivloop + tst %o3 + + bl,a Lgot_result + ! non-restoring fixup here (one instruction only!) + sub %o2, 1, %o2 + +Lgot_result: + + retl + mov %o2, %o0 + + .globl .udiv_patch +.udiv_patch: + wr %g0, 0x0, %y + nop + nop + retl + udiv %o0, %o1, %o0 + nop diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S new file mode 100644 index 00000000000..1f36ae68252 --- /dev/null +++ b/arch/sparc/lib/umul.S @@ -0,0 +1,171 @@ +/* + * umul.S: This routine was taken from glibc-1.09 and is covered + * by the GNU Library General Public License Version 2. + */ + + +/* + * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the + * upper 32 bits of the 64-bit product). + * + * This code optimizes short (less than 13-bit) multiplies. Short + * multiplies require 25 instruction cycles, and long ones require + * 45 instruction cycles. + * + * On return, overflow has occurred (%o1 is not zero) if and only if + * the Z condition code is clear, allowing, e.g., the following: + * + * call .umul + * nop + * bnz overflow (or tnz) + */ + + .globl .umul + .globl _Umul +.umul: +_Umul: /* needed for export */ + or %o0, %o1, %o4 + mov %o0, %y ! multiplier -> Y + + andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args + be Lmul_shortway ! if zero, can do it the short way + andcc %g0, %g0, %o4 ! zero the partial product and clear N and V + + /* + * Long multiply. 32 steps, followed by a final shift step. + */ + mulscc %o4, %o1, %o4 ! 1 + mulscc %o4, %o1, %o4 ! 2 + mulscc %o4, %o1, %o4 ! 3 + mulscc %o4, %o1, %o4 ! 4 + mulscc %o4, %o1, %o4 ! 5 + mulscc %o4, %o1, %o4 ! 6 + mulscc %o4, %o1, %o4 ! 7 + mulscc %o4, %o1, %o4 ! 8 + mulscc %o4, %o1, %o4 ! 9 + mulscc %o4, %o1, %o4 ! 10 + mulscc %o4, %o1, %o4 ! 11 + mulscc %o4, %o1, %o4 ! 12 + mulscc %o4, %o1, %o4 ! 13 + mulscc %o4, %o1, %o4 ! 14 + mulscc %o4, %o1, %o4 ! 15 + mulscc %o4, %o1, %o4 ! 16 + mulscc %o4, %o1, %o4 ! 17 + mulscc %o4, %o1, %o4 ! 18 + mulscc %o4, %o1, %o4 ! 19 + mulscc %o4, %o1, %o4 ! 20 + mulscc %o4, %o1, %o4 ! 21 + mulscc %o4, %o1, %o4 ! 22 + mulscc %o4, %o1, %o4 ! 23 + mulscc %o4, %o1, %o4 ! 24 + mulscc %o4, %o1, %o4 ! 25 + mulscc %o4, %o1, %o4 ! 26 + mulscc %o4, %o1, %o4 ! 27 + mulscc %o4, %o1, %o4 ! 28 + mulscc %o4, %o1, %o4 ! 29 + mulscc %o4, %o1, %o4 ! 30 + mulscc %o4, %o1, %o4 ! 31 + mulscc %o4, %o1, %o4 ! 32 + mulscc %o4, %g0, %o4 ! final shift + + + /* + * Normally, with the shift-and-add approach, if both numbers are + * positive you get the correct result. With 32-bit two's-complement + * numbers, -x is represented as + * + * x 32 + * ( 2 - ------ ) mod 2 * 2 + * 32 + * 2 + * + * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s, + * we can treat this as if the radix point were just to the left + * of the sign bit (multiply by 2^32), and get + * + * -x = (2 - x) mod 2 + * + * Then, ignoring the `mod 2's for convenience: + * + * x * y = xy + * -x * y = 2y - xy + * x * -y = 2x - xy + * -x * -y = 4 - 2x - 2y + xy + * + * For signed multiplies, we subtract (x << 32) from the partial + * product to fix this problem for negative multipliers (see mul.s). + * Because of the way the shift into the partial product is calculated + * (N xor V), this term is automatically removed for the multiplicand, + * so we don't have to adjust. + * + * But for unsigned multiplies, the high order bit wasn't a sign bit, + * and the correction is wrong. So for unsigned multiplies where the + * high order bit is one, we end up with xy - (y << 32). To fix it + * we add y << 32. + */ +#if 0 + tst %o1 + bl,a 1f ! if %o1 < 0 (high order bit = 1), + add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half) + +1: + rd %y, %o0 ! get lower half of product + retl + addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0 +#else + /* Faster code from tege@sics.se. */ + sra %o1, 31, %o2 ! make mask from sign bit + and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1 + rd %y, %o0 ! get lower half of product + retl + addcc %o4, %o2, %o1 ! add compensation and put upper half in place +#endif + +Lmul_shortway: + /* + * Short multiply. 12 steps, followed by a final shift step. + * The resulting bits are off by 12 and (32-12) = 20 bit positions, + * but there is no problem with %o0 being negative (unlike above), + * and overflow is impossible (the answer is at most 24 bits long). + */ + mulscc %o4, %o1, %o4 ! 1 + mulscc %o4, %o1, %o4 ! 2 + mulscc %o4, %o1, %o4 ! 3 + mulscc %o4, %o1, %o4 ! 4 + mulscc %o4, %o1, %o4 ! 5 + mulscc %o4, %o1, %o4 ! 6 + mulscc %o4, %o1, %o4 ! 7 + mulscc %o4, %o1, %o4 ! 8 + mulscc %o4, %o1, %o4 ! 9 + mulscc %o4, %o1, %o4 ! 10 + mulscc %o4, %o1, %o4 ! 11 + mulscc %o4, %o1, %o4 ! 12 + mulscc %o4, %g0, %o4 ! final shift + + /* + * %o4 has 20 of the bits that should be in the result; %y has + * the bottom 12 (as %y's top 12). That is: + * + * %o4 %y + * +----------------+----------------+ + * | -12- | -20- | -12- | -20- | + * +------(---------+------)---------+ + * -----result----- + * + * The 12 bits of %o4 left of the `result' area are all zero; + * in fact, all top 20 bits of %o4 are zero. + */ + + rd %y, %o5 + sll %o4, 12, %o0 ! shift middle bits left 12 + srl %o5, 20, %o5 ! shift low bits right 20 + or %o5, %o0, %o0 + retl + addcc %g0, %g0, %o1 ! %o1 = zero, and set Z + + .globl .umul_patch +.umul_patch: + umul %o0, %o1, %o0 + retl + rd %y, %o1 + nop diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S new file mode 100644 index 00000000000..77123eb83c4 --- /dev/null +++ b/arch/sparc/lib/urem.S @@ -0,0 +1,357 @@ +/* + * urem.S: This routine was taken from glibc-1.09 and is covered + * by the GNU Library General Public License Version 2. + */ + +/* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .urem name of function to generate + * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 + * false false=true => signed; false=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + .globl .urem + .globl _Urem +.urem: +_Urem: /* needed for export */ + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu Lgot_result ! (and algorithm fails otherwise) + clr %o2 + + sethi %hi(1 << (32 - 4 - 1)), %g1 + + cmp %o3, %g1 + blu Lnot_really_big + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g7 + + sll %o5, 4, %o5 + + b 1b + add %o4, 1, %o4 + + ! Now compute %g7. + 2: + addcc %o5, %o5, %o5 + bcc Lnot_too_big + add %g7, 1, %g7 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + + b Ldo_single_div + sub %g7, 1, %g7 + + Lnot_too_big: + 3: + cmp %o5, %o3 + blu 2b + nop + + be Ldo_single_div + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g7 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + Ldo_single_div: + subcc %g7, 1, %g7 + bl Lend_regular_divide + nop + + sub %o3, %o5, %o3 + mov 1, %o2 + + b Lend_single_divloop + nop + Lsingle_divloop: + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + Lend_single_divloop: + subcc %g7, 1, %g7 + bge Lsingle_divloop + tst %o3 + + b,a Lend_regular_divide + +Lnot_really_big: +1: + sll %o5, 4, %o5 + + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + + be Lgot_result + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +Ldivloop: + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl L.1.16 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl L.2.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl L.3.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl L.4.23 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +L.4.23: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + +L.3.19: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl L.4.21 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +L.4.21: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + +L.2.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl L.3.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl L.4.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +L.4.19: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + +L.3.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl L.4.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +L.4.17: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + +L.1.16: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl L.2.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl L.3.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl L.4.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +L.4.15: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + +L.3.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl L.4.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +L.4.13: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + +L.2.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl L.3.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl L.4.11 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +L.4.11: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + +L.3.13: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl L.4.9 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +L.4.9: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + 9: +Lend_regular_divide: + subcc %o4, 1, %o4 + bge Ldivloop + tst %o3 + + bl,a Lgot_result + ! non-restoring fixup here (one instruction only!) + add %o3, %o1, %o3 + +Lgot_result: + + retl + mov %o3, %o0 + + .globl .urem_patch +.urem_patch: + wr %g0, 0x0, %y + nop + nop + nop + udiv %o0, %o1, %o2 + umul %o2, %o1, %o2 + retl + sub %o0, %o2, %o0 -- cgit v1.2.2