Added missing tegra files.HEAD master

author: Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-22 10:38:37 -0500
committer: Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-22 10:38:37 -0500
commit: fcc9d2e5a6c89d22b8b773a64fb4ad21ac318446 (patch)
tree: a57612d1888735a2ec7972891b68c1ac5ec8faea /arch/alpha/lib
parent: 8dea78da5cee153b8af9c07a2745f6c55057fe12 (diff)
4 files changed, 961 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-strncpy_from_user.S b/arch/alpha/lib/ev6-strncpy_from_user.S
new file mode 100644
index 00000000000..d2e28178cac
--- /dev/null
+++ b/arch/alpha/lib/ev6-strncpy_from_user.S
@@ -0,0 +1,424 @@
+/*
+ * arch/alpha/lib/ev6-strncpy_from_user.S
+ * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
+ *
+ * Just like strncpy except in the return value:
+ *
+ * -EFAULT       if an exception occurs before the terminator is copied.
+ * N             if the buffer filled.
+ *
+ * Otherwise the length of the string is returned.
+ *
+ * Much of the information about 21264 scheduling/coding comes from:
+ *      Compiler Writer's Guide for the Alpha 21264
+ *      abbreviated as 'CWG' in other comments here
+ *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ *      E       - either cluster
+ *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ * A bunch of instructions got moved and temp registers were changed
+ * to aid in scheduling.  Control flow was also re-arranged to eliminate
+ * branches, and to provide longer code sequences to enable better scheduling.
+ * Using byte load/stores for the start & tail sequences is desirable,
+ * but is very difficult to do without a from-scratch rewrite.
+ * Save that for the future.
+ */
+#include <asm/errno.h>
+#include <asm/regdef.h>
+/* Allow an exception for an insn; exit if we get one.
+ *
+ * EX(insn) emits the instruction at local label 99 and appends one
+ * record for it to the __ex_table section: a 32-bit offset from the
+ * record back to the instruction, followed by an lda instruction word
+ * whose displacement is the distance from the instruction to the
+ * $exception label.
+ *
+ * NOTE(review): presumably the fault handler decodes the register
+ * fields of that lda ($31 and $0 here) to decide which register to
+ * clear and which one receives the error code before resuming at
+ * $exception -- confirm against the exception-table fixup code, which
+ * is not part of this file.  */
+#define EX(x,y...)                      \
+        99: x,##y;                      \
+        .section __ex_table,"a";        \
+        .long 99b - .;                  \
+        lda $31, $exception-99b($0);    \
+        .previous
+        .set noat
+        .set noreorder
+        .text
+        .globl __strncpy_from_user
+        .ent __strncpy_from_user
+        .frame $30, 0, $26
+        .prologue 0
+        .align 4
+__strncpy_from_user:
+        and     a0, 7, t3       # E : find dest misalignment
+        beq     a2, $zerolength # U : zero count: return 0 right away
+        /* Register use, as read off the code below:
+         *   a0 = destination, a1 = source (every load through a1 is
+         *   wrapped in EX), a2 = byte count.  v0 keeps the original
+         *   destination so that $finish_up can turn the address of the
+         *   last byte written into a length.
+         * Are source and destination co-aligned?  */
+        mov     a0, v0          # E : save the string start
+        xor     a0, a1, t4      # E : low 3 bits are non-zero iff not co-aligned
+        EX( ldq_u t1, 0(a1) )   # L : Latency=3 load first quadword
+        ldq_u   t0, 0(a0)       # L : load first (partial) aligned dest quadword
+        addq    a2, t3, a2      # E : bias count by dest misalignment
+        subq    a2, 1, a3       # E : a3 = biased count - 1
+        addq    zero, 1, t10    # E : t10 = 1, shifted into position below
+        and     t4, 7, t4       # E : misalignment between the two
+        and     a3, 7, t6       # E : number of tail bytes
+        sll     t10, t6, t10    # E : t10 = bitmask of last count byte
+        bne     t4, $unaligned  # U : not co-aligned: take the unaligned path
+        lda     t2, -1          # E : build a mask against false zero
+        /*
+         * We are co-aligned; take care of a partial first word.
+         * On entry to this basic block:
+         * t0 == the first destination word for masking back in
+         * t1 == the first source word.
+         * t2 == all ones, t10 == the end-of-count bit computed above.
+         */
+        srl     a3, 3, a2       # E : a2 = loop counter = (count - 1)/8
+        addq    a1, 8, a1       # E : step src past the first word (low 3 bits unchanged)
+        mskqh   t2, a1, t2      # U :   detection in the src word
+        nop
+        /* Create the 1st output word and detect 0's in the 1st input word.
+         * Bytes below the start offset come from the destination word,
+         * bytes at or above it come from the source word.  */
+        mskqh   t1, a1, t3      # U : t3 = src bytes at/above the start offset
+        mskql   t0, a1, t0      # U : assemble the first output word
+        ornot   t1, t2, t2      # E : t2 = src with the bytes below the start forced to 0xff
+        nop
+        cmpbge  zero, t2, t8    # E : bits set iff null found
+        or      t0, t3, t0      # E : t0 = first output word
+        beq     a2, $a_eoc      # U : count ends inside this first word
+        bne     t8, $a_eos      # U : 2nd branch in a quad.  Bad.
+        /* On entry to this basic block:
+         * t0 == a source quad not containing a null.
+         * a0 - current aligned destination address
+         * a1 - current aligned source address
+         * a2 - count of quadwords to move.
+         * Each pass stores the word fetched by the previous pass, then
+         * loads and tests the next source word.
+         * NOTE: Loop improvement - unrolling this is going to be
+         *      a huge win, since we're going to stall otherwise.
+         *      Fix this later.  For _really_ large copies, look
+         *      at using wh64 on a look-ahead basis.  See the code
+         *      in clear_user.S and copy_user.S.
+         * Presumably (a0) and (a1) do not overlap (by C definition).
+         * Lots of nops here:
+         *      - Separate loads from stores
+         *      - Keep it to 1 branch/quadpack so the branch predictor
+         *        can train.
+         */
+$a_loop:
+        stq_u   t0, 0(a0)       # L : store the word that held no null
+        addq    a0, 8, a0       # E : advance dst
+        nop
+        subq    a2, 1, a2       # E : one fewer quadword to move
+        EX( ldq_u t0, 0(a1) )   # L :
+        addq    a1, 8, a1       # E : advance src
+        cmpbge  zero, t0, t8    # E : Stall 2 cycles on t0
+        beq     a2, $a_eoc      # U : count exhausted: add the end-of-count bit
+        beq     t8, $a_loop     # U : no null in t0: keep copying
+        nop
+        nop
+        nop
+        /* Take care of the final (partial) word store.  At this point
+         * the end-of-count bit is set in t8 iff it applies.
+         *
+         * On entry to this basic block we have:
+         * t0 == the source word containing the null
+         * t8 == the cmpbge mask that found it.
+         *
+         * On exit t12 holds the single bit of the last byte written and
+         * t0 the word that was stored; $finish_up reads both.
+         */
+$a_eos:
+        negq    t8, t12         # E : find low bit set
+        and     t8, t12, t12    # E : t12 = lowest set bit of t8
+        /* We're doing a partial word store and so need to combine
+           our source and original destination words.  */
+        ldq_u   t1, 0(a0)       # L : t1 = current destination word
+        subq    t12, 1, t6      # E : t6 = all bits below that bit
+        or      t12, t6, t8     # E : t8 = byte mask up to and including that byte
+        zapnot  t0, t8, t0      # U : clear src bytes > null
+        zap     t1, t8, t1      # U : clear dst bytes <= null
+        or      t0, t1, t0      # E : merge the two halves
+        stq_u   t0, 0(a0)       # L : write the final word
+        br      $finish_up      # L0 : go compute the return value
+        nop
+        nop
+        /* Add the end-of-count bit to the eos detection bitmask.  */
+        .align 4
+$a_eoc:
+        or      t10, t8, t8     # t8 |= bit of the last byte allowed by count
+        br      $a_eos          # finish through the common tail above
+        nop
+        nop
+/* The source and destination are not co-aligned.  Align the destination
+   and cope.  We have to be very careful about not reading too much and
+   causing a SEGV.  */
+        .align 4
+$u_head:
+        /* We know just enough now to be able to assemble the first
+           full source word.  We can still find a zero at the end of it
+           that prevents us from outputting the whole thing.
+           On entry to this basic block:
+           t0 == the first dest word, unmasked
+           t1 == the shifted low bits of the first source word
+           t6 == bytemask that is -1 in dest word bytes
+           The code below also reads a2 (quadword loop counter) and,
+           through $u_eocfin, t10 (end-of-count bit).  */
+        EX( ldq_u t2, 8(a1) )   # L : load second src word
+        addq    a1, 8, a1       # E : step src to the second word
+        mskql   t0, a0, t0      # U : mask trailing garbage in dst
+        extqh   t2, a1, t4      # U : shift second word up to sit above the t1 bytes
+        or      t1, t4, t1      # E : first aligned src word complete
+        mskqh   t1, a0, t1      # U : mask leading garbage in src
+        or      t0, t1, t0      # E : first output word complete
+        or      t0, t6, t6      # E : mask original data for zero test
+        cmpbge  zero, t6, t8    # E : t8 = null mask for the first output word
+        beq     a2, $u_eocfin   # U : count ends inside this word
+        bne     t8, $u_final    # U : bad news - 2nd branch in a quad
+        lda     t6, -1          # E : mask out the bits we have
+        mskql   t6, a1, t6      # U :   already seen
+        stq_u   t0, 0(a0)       # L : store first output word
+        or      t6, t2, t2      # E : force the already-seen bytes to non-zero
+        cmpbge  zero, t2, t8    # E : find nulls in second partial
+        addq    a0, 8, a0               # E : advance dst
+        subq    a2, 1, a2               # E : one fewer quadword to move
+        bne     t8, $u_late_head_exit   # U : null in the rest of the second word
+        nop
+        /* Finally, we've got all the stupid leading edge cases taken care
+           of and we can set up to enter the main loop.  */
+        extql   t2, a1, t1      # U : position hi-bits of lo word
+        EX( ldq_u t2, 8(a1) )   # L : read next high-order source word
+        addq    a1, 8, a1       # E : advance src
+        cmpbge  zero, t2, t8    # E : null anywhere in the new word?
+        beq     a2, $u_eoc      # U : count exhausted
+        bne     t8, $u_eos      # U : the new word holds a null
+        nop
+        nop
+        /* Unaligned copy main loop.  In order to avoid reading too much,
+           the loop is structured to detect zeros in aligned source words.
+           This has, unfortunately, effectively pulled half of a loop
+           iteration out into the head and half into the tail, but it does
+           prevent nastiness from accumulating in the very thing we want
+           to run as fast as possible.
+           On entry to this basic block:
+           t1 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word
+           We further know that t2 does not contain a null terminator.
+           The loop leaves through $u_eoc when a2 reaches zero, or falls
+           into $u_eos when the freshly loaded word contains a null.  */
+        /*
+         * Extra nops here:
+         *      separate load quads from store quads
+         *      only one branch/quad to permit predictor training
+         */
+        .align 4
+$u_loop:
+        extqh   t2, a1, t0      # U : extract high bits for current word
+        addq    a1, 8, a1       # E : advance src
+        extql   t2, a1, t3      # U : extract low bits for next time
+        addq    a0, 8, a0       # E : advance dst (the store below uses -8)
+        or      t0, t1, t0      # E : current dst word now complete
+        EX( ldq_u t2, 0(a1) )   # L : load high word for next time
+        subq    a2, 1, a2       # E : one fewer quadword to move
+        nop
+        stq_u   t0, -8(a0)      # L : save the current word
+        mov     t3, t1          # E : carry the low bits into the next pass
+        cmpbge  zero, t2, t8    # E : test new word for eos
+        beq     a2, $u_eoc      # U : count exhausted
+        beq     t8, $u_loop     # U : no null yet: keep copying
+        nop
+        nop
+        nop
+        /* We've found a zero somewhere in the source word we just read.
+           If it resides in the lower half, we have one (probably partial)
+           word to write out, and if it resides in the upper half, we
+           have one full and one partial word left to write out.
+           On entry to this basic block:
+           t1 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word.  */
+        .align 4
+$u_eos:
+        extqh   t2, a1, t0      # U : shift the current word up above the t1 bytes
+        or      t0, t1, t0      # E : first (partial) source word complete
+        cmpbge  zero, t0, t8    # E : is the null in this first bit?
+        nop
+        bne     t8, $u_final    # U : yes: this is the last word to write
+        stq_u   t0, 0(a0)       # L : the null was in the high-order bits
+        addq    a0, 8, a0       # E : advance dst
+        subq    a2, 1, a2       # E : one fewer quadword to move
+        .align 4
+$u_late_head_exit:
+        extql   t2, a1, t0      # U : remaining bytes of t2, shifted down
+        cmpbge  zero, t0, t8    # E : null mask for that last word
+        or      t8, t10, t6     # E : the same mask plus the end-of-count bit
+        cmoveq  a2, t6, t8      # E : use it when the count is exhausted (a2 == 0)
+        /* Take care of a final (probably partial) result word.
+           On entry to this basic block:
+           t0 == assembled source word
+           t8 == cmpbge mask that found the null.
+           On exit t12 holds the single bit of the last byte written and
+           t0 the word that was stored; $finish_up reads both.  */
+        .align 4
+$u_final:
+        negq    t8, t6          # E : isolate low bit set
+        and     t6, t8, t12     # E : t12 = lowest set bit of t8
+        ldq_u   t1, 0(a0)       # L : t1 = current destination word
+        subq    t12, 1, t6      # E : t6 = all bits below that bit
+        or      t6, t12, t8     # E : t8 = byte mask up to and including that byte
+        zapnot  t0, t8, t0      # U : kill source bytes > null
+        zap     t1, t8, t1      # U : kill dest bytes <= null
+        or      t0, t1, t0      # E : merge the two halves
+        stq_u   t0, 0(a0)       # E : write the final word
+        br      $finish_up      # U : go compute the return value
+        nop
+        nop
+        .align 4
+$u_eoc:                         # end-of-count
+        extqh   t2, a1, t0      # U : shift the current word up above the t1 bytes
+        or      t0, t1, t0      # E : last output word assembled
+        cmpbge  zero, t0, t8    # E : null mask for it
+        nop
+        .align 4
+$u_eocfin:                      # end-of-count, final word
+        or      t10, t8, t8     # E : add the end-of-count bit to the mask
+        br      $u_final        # U : write the word through the common tail
+        nop
+        nop
+        /* Unaligned copy entry point.
+           Reached from the function entry when the low three bits of
+           a0 and a1 differ.  At that point t1 holds the first source
+           quadword, a3 the biased count minus one and t10 the
+           end-of-count bit.  */
+        .align 4
+$unaligned:
+        srl     a3, 3, a2       # U : a2 = loop counter = (count - 1)/8
+        and     a0, 7, t4       # E : find dest misalignment
+        and     a1, 7, t5       # E : find src misalignment
+        mov     zero, t0        # E : default: no destination bytes to keep
+        /* Conditionally load the first destination word and a bytemask
+           with 0xff indicating that the destination byte is sacrosanct.  */
+        mov     zero, t6        # E : default: empty keep-mask
+        beq     t4, 1f          # U : dest is aligned: keep the defaults
+        ldq_u   t0, 0(a0)       # L : first destination word
+        lda     t6, -1          # E : all ones, trimmed by the next insn
+        mskql   t6, a0, t6      # E : t6 = 0xff in the bytes below the dest start
+        nop
+        nop
+        nop
+        .align 4
+1:
+        subq    a1, t4, a1      # E : sub dest misalignment from src addr
+        /* If source misalignment is larger than dest misalignment, we need
+           extra startup checks to avoid SEGV.  */
+        cmplt   t4, t5, t12     # E : t12 = 1 iff dest misalignment < src misalignment
+        extql   t1, a1, t1      # U : shift src into place
+        lda     t2, -1          # E : for creating masks later
+        beq     t12, $u_head    # U : not larger: no extra checks needed
+        mskqh   t2, t5, t2      # U : begin src byte validity mask
+        cmpbge  zero, t1, t8    # E : is there a zero?
+        nop
+        extql   t2, a1, t2      # U : shift the validity mask like the data
+        or      t8, t10, t5     # E : test for end-of-count too
+        cmpbge  zero, t2, t3    # E : t3 = bytes that carry no source data
+        cmoveq  a2, t5, t8      # E : Latency=2, extra map slot
+        nop                     # E : goes with cmov
+        andnot  t8, t3, t8      # E : ignore the bytes that carry no source data
+        beq     t8, $u_head     # U : nothing ends in the first partial word
+        nop
+        /* At this point we've found a zero in the first partial word of
+           the source.  We need to isolate the valid source data and mask
+           it into the original destination data.  (Incidentally, we know
+           that we'll need at least one byte of that original dest word.) */
+        ldq_u   t0, 0(a0)       # L : (re)load the destination word
+        negq    t8, t6          # E : build bitmask of bytes <= zero
+        mskqh   t1, t4, t1      # U : drop src bytes below the dest start
+        and     t6, t8, t12     # E : t12 = lowest set bit of t8
+        subq    t12, 1, t6      # E : t6 = all bits below that bit
+        or      t6, t12, t8     # E : t8 = byte mask up to and including that byte
+        zapnot  t2, t8, t2      # U : prepare source word; mirror changes
+        zapnot  t1, t8, t1      # U : to source validity mask
+        andnot  t0, t2, t0      # E : zero place for source to reside
+        or      t0, t1, t0      # E : and put it there
+        stq_u   t0, 0(a0)       # L : write the only output word, then fall into $finish_up
+        nop
+        .align 4
+$finish_up:                     # in: t0 = word just stored at 0(a0), t12 = bit of last byte written
+        zapnot  t0, t12, t4     # U : was last byte written null?
+        and     t12, 0xf0, t3   # E : binary search for the address of the
+        cmovne  t4, 1, t4       # E : Latency=2, extra map slot
+        nop                     # E : with cmovne
+        and     t12, 0xcc, t2   # E : last byte written
+        and     t12, 0xaa, t1   # E : non-zero iff the byte index is odd
+        cmovne  t3, 4, t3       # E : Latency=2, extra map slot
+        nop                     # E : with cmovne
+        bic     a0, 7, t0       # aligned address of the word just stored
+        cmovne  t2, 2, t2       # E : Latency=2, extra map slot
+        nop                     # E : with cmovne
+        nop
+        cmovne  t1, 1, t1       # E : Latency=2, extra map slot
+        nop                     # E : with cmovne
+        addq    t0, t3, t0      # E : t3 + t2 + t1 = index of the last byte written
+        addq    t1, t2, t1      # E :
+        addq    t0, t1, t0      # E : t0 = address of the last byte written
+        addq    t0, t4, t0      # add one if we filled the buffer
+        subq    t0, v0, v0      # find string length
+        ret                     # L0 :
+        .align 4
+$zerolength:                    # count was zero on entry
+        nop
+        nop
+        nop
+        clr     v0              # return 0, falling through to the ret below
+$exception:                     # fixup target named in the EX macro; v0 is not modified here
+        nop
+        nop
+        nop
+        ret
+        .end __strncpy_from_user
diff --git a/arch/alpha/lib/ev67-strlen_user.S b/arch/alpha/lib/ev67-strlen_user.S
new file mode 100644
index 00000000000..57e0d77b81a
--- /dev/null
+++ b/arch/alpha/lib/ev67-strlen_user.S
@@ -0,0 +1,107 @@
+/*
+ * arch/alpha/lib/ev67-strlen_user.S
+ * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
+ *
+ * Return the length of the string including the NUL terminator
+ * (strlen+1) or zero if an error occurred.
+ *
+ * In places where it is critical to limit the processing time,
+ * and the data is not trusted, strnlen_user() should be used.
+ * It will return a value greater than its second argument if
+ * that limit would be exceeded. This implementation is allowed
+ * to access memory beyond the limit, but will not cross a page
+ * boundary when doing so.
+ *
+ * Much of the information about 21264 scheduling/coding comes from:
+ *      Compiler Writer's Guide for the Alpha 21264
+ *      abbreviated as 'CWG' in other comments here
+ *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ *      E       - either cluster
+ *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ * Try not to change the actual algorithm if possible for consistency.
+ */
+#include <asm/regdef.h>
+/* Allow an exception for an insn; exit if we get one.
+ *
+ * EX(insn) emits the instruction at local label 99 and appends one
+ * record for it to the __ex_table section: a 32-bit offset from the
+ * record back to the instruction, followed by an lda instruction word
+ * whose displacement is the distance from the instruction to the
+ * $exception label.
+ *
+ * NOTE(review): presumably the fault handler decodes the register
+ * fields of that lda (v0 and zero here) so that v0 ends up as 0 when
+ * execution resumes at $exception, matching the "zero if an error
+ * occurred" contract in the file header -- confirm against the
+ * exception-table fixup code, which is not part of this file.  */
+#define EX(x,y...)                      \
+        99: x,##y;                      \
+        .section __ex_table,"a";        \
+        .long 99b - .;                  \
+        lda v0, $exception-99b(zero);   \
+        .previous
+        .set noreorder
+        .set noat
+        .text
+        .globl __strlen_user
+        .ent __strlen_user
+        .frame sp, 0, ra
+        .align 4
+__strlen_user:
+        ldah    a1, 32767(zero) # limit = 32767 << 16; do not use plain strlen_user()
+                                # for strings that might be almost 2 GB long; you
+                                # should be using strnlen_user() instead
+        nop
+        nop
+        nop
+        .globl __strnlen_user
+        .align 4
+__strnlen_user:                 # in: a0 = string address, a1 = limit
+        .prologue 0
+        EX( ldq_u t0, 0(a0) )   # L : load first quadword (a0 may be misaligned)
+        lda     t1, -1(zero)    # E : t1 = all ones
+        insqh   t1, a0, t1      # U : t1 = 0xff in the bytes below the start offset
+        andnot  a0, 7, v0       # E : v0 = address of the aligned first quadword
+        or      t1, t0, t0      # E : bytes before the string can no longer look like NUL
+        subq    a0, 1, a0       # E : get our +1 for the return
+        cmpbge  zero, t0, t1    # E : t1 <- bitmask: bit i == 1 <==> i-th byte == 0
+        subq    a1, 7, t2       # E : t2 = limit - 7
+        subq    a0, v0, t0      # E : t0 = start offset within the quadword, minus 1
+        bne     t1, $found      # U : terminator is in the first quadword
+        addq    t2, t0, t2      # E : t2 = limit - 8 + start offset
+        addq    a1, 1, a1       # E : a1 = limit + 1, consumed at $limit
+        nop                     # E :
+        nop                     # E :
+        .align 4
+$loop:  ble     t2, $limit      # U : give up once t2 <= 0
+        EX( ldq t0, 8(v0) )     # L :
+        nop                     # E :
+        nop                     # E :
+        cmpbge  zero, t0, t1    # E : t1 = zero-byte bitmask of the new quadword
+        subq    t2, 8, t2       # E : eight more bytes examined
+        addq    v0, 8, v0       # E : addr += 8
+        beq     t1, $loop       # U : no terminator yet
+$found: cttz    t1, t2          # U0 : t2 = index of the first zero byte
+        addq    v0, t2, v0      # E : v0 = address of the terminator
+        subq    v0, a0, v0      # E : minus (start - 1): length including the terminator
+        ret                     # L0 :
+$exception:                     # fixup target named in the EX macro; v0 is not modified here
+        nop
+        nop
+        nop
+        ret
+        .align 4                # currently redundant
+$limit:                         # reached only from the ble above, so t2 <= 0
+        nop
+        nop
+        subq    a1, t2, v0      # v0 = (limit + 1) - t2, i.e. greater than limit
+        ret
+        .end __strlen_user
diff --git a/arch/alpha/lib/strlen_user.S b/arch/alpha/lib/strlen_user.S
new file mode 100644
index 00000000000..508a18e9647
--- /dev/null
+++ b/arch/alpha/lib/strlen_user.S
@@ -0,0 +1,91 @@
+/*
+ * arch/alpha/lib/strlen_user.S
+ *
+ * Return the length of the string including the NUL terminator
+ * (strlen+1) or zero if an error occurred.
+ *
+ * In places where it is critical to limit the processing time,
+ * and the data is not trusted, strnlen_user() should be used.
+ * It will return a value greater than its second argument if
+ * that limit would be exceeded. This implementation is allowed
+ * to access memory beyond the limit, but will not cross a page
+ * boundary when doing so.
+ */
+#include <asm/regdef.h>
+/* Allow an exception for an insn; exit if we get one.
+ *
+ * EX(insn) emits the instruction at local label 99 and appends one
+ * record for it to the __ex_table section: a 32-bit offset from the
+ * record back to the instruction, followed by an lda instruction word
+ * whose displacement is the distance from the instruction to the
+ * $exception label.
+ *
+ * NOTE(review): presumably the fault handler decodes the register
+ * fields of that lda (v0 and zero here) so that v0 ends up as 0 when
+ * execution resumes at $exception, matching the "zero if an error
+ * occurred" contract in the file header -- confirm against the
+ * exception-table fixup code, which is not part of this file.  */
+#define EX(x,y...)                      \
+        99: x,##y;                      \
+        .section __ex_table,"a";        \
+        .long 99b - .;                  \
+        lda v0, $exception-99b(zero);   \
+        .previous
+        .set noreorder
+        .set noat
+        .text
+        .globl __strlen_user
+        .ent __strlen_user
+        .frame sp, 0, ra
+        .align 3
+__strlen_user:
+        ldah    a1, 32767(zero) # limit = 32767 << 16; do not use plain strlen_user()
+                                # for strings that might be almost 2 GB long; you
+                                # should be using strnlen_user() instead
+        .globl __strnlen_user
+        .align 3
+__strnlen_user:                 # in: a0 = string address, a1 = limit
+        .prologue 0
+        EX( ldq_u t0, 0(a0) )   # load first quadword (a0 may be misaligned)
+        lda     t1, -1(zero)    # t1 = all ones
+        insqh   t1, a0, t1      # t1 = 0xff in the bytes below the start offset
+        andnot  a0, 7, v0       # v0 = address of the aligned first quadword
+        or      t1, t0, t0      # bytes before the string can no longer look like NUL
+        subq    a0, 1, a0       # get our +1 for the return
+        cmpbge  zero, t0, t1    # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
+        subq    a1, 7, t2       # t2 = limit - 7
+        subq    a0, v0, t0      # t0 = start offset within the quadword, minus 1
+        bne     t1, $found      # terminator is in the first quadword
+        addq    t2, t0, t2      # t2 = limit - 8 + start offset
+        addq    a1, 1, a1       # a1 = limit + 1, consumed at $limit
+        .align 3
+$loop:  ble     t2, $limit      # give up once t2 <= 0
+        EX( ldq t0, 8(v0) )
+        subq    t2, 8, t2       # eight more bytes examined
+        addq    v0, 8, v0       # addr += 8
+        cmpbge  zero, t0, t1    # t1 = zero-byte bitmask of the new quadword
+        beq     t1, $loop       # no terminator yet
+$found: negq    t1, t2          # clear all but least set bit
+        and     t1, t2, t1      # t1 = single bit of the first zero byte
+        and     t1, 0xf0, t2    # binary search for that set bit
+        and     t1, 0xcc, t3    # non-zero iff bit 1 of the index is set
+        and     t1, 0xaa, t4    # non-zero iff the index is odd
+        cmovne  t2, 4, t2       # t2 = 4 when the index is 4 or more
+        cmovne  t3, 2, t3       # t3 = 2 when bit 1 of the index is set
+        cmovne  t4, 1, t4       # t4 = 1 when the index is odd
+        addq    t2, t3, t2      # t2 + t4 = index of the first zero byte
+        addq    v0, t4, v0
+        addq    v0, t2, v0      # v0 = address of the terminator
+        nop                     # dual issue next two on ev4 and ev5
+        subq    v0, a0, v0      # minus (start - 1): length including the terminator
+$exception:                     # fixup target named in the EX macro; shares the ret below
+        ret
+        .align 3                # currently redundant
+$limit:                         # reached only from the ble above, so t2 <= 0
+        subq    a1, t2, v0      # v0 = (limit + 1) - t2, i.e. greater than limit
+        ret
+        .end __strlen_user
diff --git a/arch/alpha/lib/strncpy_from_user.S b/arch/alpha/lib/strncpy_from_user.S
new file mode 100644
index 00000000000..73ee21160ff
--- /dev/null
+++ b/arch/alpha/lib/strncpy_from_user.S
@@ -0,0 +1,339 @@
+/*
+ * arch/alpha/lib/strncpy_from_user.S
+ * Contributed by Richard Henderson (rth@tamu.edu)
+ *
+ * Just like strncpy except in the return value:
+ *
+ * -EFAULT       if an exception occurs before the terminator is copied.
+ * N             if the buffer filled.
+ *
+ * Otherwise the length of the string is returned.
+ */
+#include <asm/errno.h>
+#include <asm/regdef.h>
+/* Allow an exception for an insn; exit if we get one.
+ *
+ * EX(insn) emits the instruction at local label 99 and appends one
+ * record for it to the __ex_table section: a 32-bit offset from the
+ * record back to the instruction, followed by an lda instruction word
+ * whose displacement is the distance from the instruction to the
+ * $exception label.
+ *
+ * NOTE(review): presumably the fault handler decodes the register
+ * fields of that lda ($31 and $0 here) to decide which register to
+ * clear and which one receives the error code before resuming at
+ * $exception -- confirm against the exception-table fixup code, which
+ * is not part of this file.  */
+#define EX(x,y...)                      \
+        99: x,##y;                      \
+        .section __ex_table,"a";        \
+        .long 99b - .;                  \
+        lda $31, $exception-99b($0);    \
+        .previous
+        .set noat
+        .set noreorder
+        .text
+        .globl __strncpy_from_user
+        .ent __strncpy_from_user
+        .frame $30, 0, $26
+        .prologue 0
+        .align 3
+$aligned:
+        /* On entry to this basic block:
+           t0 == the first destination word for masking back in
+           t1 == the first source word.  */
+        /* Create the 1st output word and detect 0's in the 1st input word.  */
+        lda     t2, -1          # e1    : build a mask against false zero
+        mskqh   t2, a1, t2      # e0    :   detection in the src word
+        mskqh   t1, a1, t3      # e0    :
+        ornot   t1, t2, t2      # .. e1 :
+        mskql   t0, a1, t0      # e0    : assemble the first output word
+        cmpbge  zero, t2, t8    # .. e1 : bits set iff null found
+        or      t0, t3, t0      # e0    :
+        beq     a2, $a_eoc      # .. e1 :
+        bne     t8, $a_eos      # .. e1 :
+        /* On entry to this basic block:
+           t0 == a source word not containing a null.  */
+$a_loop:
+        stq_u   t0, 0(a0)       # e0    :
+        addq    a0, 8, a0       # .. e1 :
+        EX( ldq_u t0, 0(a1) )   # e0    :
+        addq    a1, 8, a1       # .. e1 :
+        subq    a2, 1, a2       # e0    :
+        cmpbge  zero, t0, t8    # .. e1 (stall)
+        beq     a2, $a_eoc      # e1    :
+        beq     t8, $a_loop     # e1    :
+        /* Take care of the final (partial) word store.  At this point
+           the end-of-count bit is set in t8 iff it applies.
+           On entry to this basic block we have:
+           t0 == the source word containing the null
+           t8 == the cmpbge mask that found it.  */
+$a_eos:
+        negq    t8, t12         # e0    : find low bit set
+        and     t8, t12, t12    # e1 (stall)
+        /* For the sake of the cache, don't read a destination word
+           if we're not going to need it.  */
+        and     t12, 0x80, t6   # e0    :
+        bne     t6, 1f          # .. e1 (zdb)
+        /* We're doing a partial word store and so need to combine
+           our source and original destination words.  */
+        ldq_u   t1, 0(a0)       # e0    :
+        subq    t12, 1, t6      # .. e1 :
+        or      t12, t6, t8     # e0    :
+        unop                    #
+        zapnot  t0, t8, t0      # e0    : clear src bytes > null
+        zap     t1, t8, t1      # .. e1 : clear dst bytes <= null
+        or      t0, t1, t0      # e1    :
+1:      stq_u   t0, 0(a0)
+        br      $finish_up
+        /* Add the end-of-count bit to the eos detection bitmask.  */
+$a_eoc:
+        or      t10, t8, t8
+        br      $a_eos
+        /*** The Function Entry Point ***/
+        .align 3
+__strncpy_from_user:
+        mov     a0, v0          # save the string start
+        beq     a2, $zerolength
+        /* Are source and destination co-aligned?  */
+        xor     a0, a1, t1      # e0    :
+        and     a0, 7, t0       # .. e1 : find dest misalignment
+        and     t1, 7, t1       # e0    :
+        addq    a2, t0, a2      # .. e1 : bias count by dest misalignment
+        subq    a2, 1, a2       # e0    :
+        and     a2, 7, t2       # e1    :
+        srl     a2, 3, a2       # e0    : a2 = loop counter = (count - 1)/8
+        addq    zero, 1, t10    # .. e1 :
+        sll     t10, t2, t10    # e0    : t10 = bitmask of last count byte
+        bne     t1, $unaligned  # .. e1 :
+        /* We are co-aligned; take care of a partial first word.  */
+        EX( ldq_u t1, 0(a1) )   # e0    : load first src word
+        addq    a1, 8, a1       # .. e1 :
+        beq     t0, $aligned    # avoid loading dest word if not needed
+        ldq_u   t0, 0(a0)       # e0    :
+        br      $aligned        # .. e1 :
+/* The source and destination are not co-aligned.  Align the destination
+   and cope.  We have to be very careful about not reading too much and
+   causing a SEGV.  */
+        .align 3
+$u_head:
+        /* We know just enough now to be able to assemble the first
+           full source word.  We can still find a zero at the end of it
+           that prevents us from outputting the whole thing.
+           On entry to this basic block:
+           t0 == the first dest word, unmasked
+           t1 == the shifted low bits of the first source word
+           t6 == bytemask that is -1 in dest word bytes */
+        EX( ldq_u t2, 8(a1) )   # e0    : load second src word
+        addq    a1, 8, a1       # .. e1 :
+        mskql   t0, a0, t0      # e0    : mask trailing garbage in dst
+        extqh   t2, a1, t4      # e0    :
+        or      t1, t4, t1      # e1    : first aligned src word complete
+        mskqh   t1, a0, t1      # e0    : mask leading garbage in src
+        or      t0, t1, t0      # e0    : first output word complete
+        or      t0, t6, t6      # e1    : mask original data for zero test
+        cmpbge  zero, t6, t8    # e0    :
+        beq     a2, $u_eocfin   # .. e1 :
+        bne     t8, $u_final    # e1    :
+        lda     t6, -1                  # e1    : mask out the bits we have
+        mskql   t6, a1, t6              # e0    :   already seen
+        stq_u   t0, 0(a0)               # e0    : store first output word
+        or      t6, t2, t2              # .. e1 :
+        cmpbge  zero, t2, t8            # e0    : find nulls in second partial
+        addq    a0, 8, a0               # .. e1 :
+        subq    a2, 1, a2               # e0    :
+        bne     t8, $u_late_head_exit   # .. e1 :
+        /* Finally, we've got all the stupid leading edge cases taken care
+           of and we can set up to enter the main loop.  */
+        extql   t2, a1, t1      # e0    : position hi-bits of lo word
+        EX( ldq_u t2, 8(a1) )   # .. e1 : read next high-order source word
+        addq    a1, 8, a1       # e0    :
+        cmpbge  zero, t2, t8    # e1 (stall)
+        beq     a2, $u_eoc      # e1    :
+        bne     t8, $u_eos      # e1    :
+        /* Unaligned copy main loop.  In order to avoid reading too much,
+           the loop is structured to detect zeros in aligned source words.
+           This has, unfortunately, effectively pulled half of a loop
+           iteration out into the head and half into the tail, but it does
+           prevent nastiness from accumulating in the very thing we want
+           to run as fast as possible.
+           On entry to this basic block:
+           t1 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word
+           We further know that t2 does not contain a null terminator.  */
+        .align 3
+$u_loop:
+        extqh   t2, a1, t0      # e0    : extract high bits for current word
+        addq    a1, 8, a1       # .. e1 :
+        extql   t2, a1, t3      # e0    : extract low bits for next time
+        addq    a0, 8, a0       # .. e1 :
+        or      t0, t1, t0      # e0    : current dst word now complete
+        EX( ldq_u t2, 0(a1) )   # .. e1 : load high word for next time
+        stq_u   t0, -8(a0)      # e0    : save the current word
+        mov     t3, t1          # .. e1 :
+        subq    a2, 1, a2       # e0    :
+        cmpbge  zero, t2, t8    # .. e1 : test new word for eos
+        beq     a2, $u_eoc      # e1    :
+        beq     t8, $u_loop     # e1    :
+        /* We've found a zero somewhere in the source word we just read.
+           If it resides in the lower half, we have one (probably partial)
+           word to write out, and if it resides in the upper half, we
+           have one full and one partial word left to write out.
+           On entry to this basic block:
+           t1 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word.  */
+$u_eos:
+        extqh   t2, a1, t0      # e0    :
+        or      t0, t1, t0      # e1    : first (partial) source word complete
+        cmpbge  zero, t0, t8    # e0    : is the null in this first bit?
+        bne     t8, $u_final    # .. e1 (zdb)
+        stq_u   t0, 0(a0)       # e0    : the null was in the high-order bits
+        addq    a0, 8, a0       # .. e1 :
+        subq    a2, 1, a2       # e1    :
+$u_late_head_exit:
+        extql   t2, a1, t0      # .. e0 :
+        cmpbge  zero, t0, t8    # e0    :
+        or      t8, t10, t6     # e1    :
+        cmoveq  a2, t6, t8      # e0    :
+        nop                     # .. e1 :
+        /* Take care of a final (probably partial) result word.
+           On entry to this basic block:
+           t0 == assembled source word
+           t8 == cmpbge mask that found the null.  */
+$u_final:
+        negq    t8, t6          # e0    : isolate low bit set
+        and     t6, t8, t12     # e1    : t12 = lowest set bit of t8
+        and     t12, 0x80, t6   # e0    : avoid dest word load if we can
+        bne     t6, 1f          # .. e1 (zdb) : bit 7 set: store t0 whole, skip the merge
+        ldq_u   t1, 0(a0)       # e0    : fetch the existing dest word to merge with
+        subq    t12, 1, t6      # .. e1 : t6 = all mask bits below t12
+        or      t6, t12, t8     # e0    : t8 = mask bits up to and including t12
+        zapnot  t0, t8, t0      # .. e1 : kill source bytes > null
+        zap     t1, t8, t1      # e0    : kill dest bytes <= null
+        or      t0, t1, t0      # e1    : combine kept source bytes with kept dest bytes
+1:      stq_u   t0, 0(a0)       # e0    : write the final word
+        br      $finish_up
+$u_eoc:                         # end-of-count
+        extqh   t2, a1, t0      # extract high bits, as at the top of $u_loop
+        or      t0, t1, t0      # complete the word with the bits carried in t1
+        cmpbge  zero, t0, t8    # t8 = byte mask of zero bytes in that word
+$u_eocfin:                      # end-of-count, final word
+        or      t10, t8, t8     # fold t10 into the mask (t10 not written here; presumably the end-of-count mask)
+        br      $u_final        # finish through the shared final-word code
+        /* Unaligned copy entry point.  */
+        .align 3
+$unaligned:
+        EX( ldq_u t1, 0(a1) )   # e0    : load first source word
+        and     a0, 7, t4       # .. e1 : find dest misalignment
+        and     a1, 7, t5       # e0    : find src misalignment
+        /* Conditionally load the first destination word and a bytemask
+           with 0xff indicating that the destination byte is sacrosanct.  */
+        mov     zero, t0        # .. e1 : default: no destination word
+        mov     zero, t6        # e0    : default: empty bytemask
+        beq     t4, 1f          # .. e1 : dest is aligned: keep the defaults
+        ldq_u   t0, 0(a0)       # e0    : first destination word
+        lda     t6, -1          # .. e1 : start from all ones
+        mskql   t6, a0, t6      # e0    : keep 0xff only in bytes below a0's byte offset
+1:
+        subq    a1, t4, a1      # .. e1 : sub dest misalignment from src addr
+        /* If source misalignment is larger than dest misalignment, we need
+           extra startup checks to avoid SEGV.  */
+        cmplt   t4, t5, t12     # e1    : t12 = (dest misalignment < src misalignment)
+        extql   t1, a1, t1      # .. e0 : shift src into place
+        lda     t2, -1          # e0    : for creating masks later
+        beq     t12, $u_head    # e1    : src not more misaligned: skip the extra checks
+        mskqh   t2, t5, t2      # e0    : begin src byte validity mask
+        cmpbge  zero, t1, t8    # .. e1 : is there a zero?
+        extql   t2, a1, t2      # e0    : shift the validity mask the same way as the src
+        or      t8, t10, t5     # .. e1 : test for end-of-count too
+        cmpbge  zero, t2, t3    # e0    : t3 = bytes where the validity mask is zero
+        cmoveq  a2, t5, t8      # .. e1 : if a2 == 0 use the end-of-count variant
+        andnot  t8, t3, t8      # e0    : drop hits that fall in non-valid bytes
+        beq     t8, $u_head     # .. e1 (zdb)
+        /* At this point we've found a zero in the first partial word of
+           the source.  We need to isolate the valid source data and mask
+           it into the original destination data.  (Incidentally, we know
+           that we'll need at least one byte of that original dest word.) */
+        ldq_u   t0, 0(a0)       # e0    : load the destination word
+        negq    t8, t6          # .. e1 : build bitmask of bytes <= zero
+        mskqh   t1, t4, t1      # e0    : clear src bytes below the dest byte offset
+        and     t6, t8, t12     # .. e1 : t12 = lowest set bit of t8
+        subq    t12, 1, t6      # e0    : t6 = all mask bits below t12
+        or      t6, t12, t8     # e1    : t8 = mask bits up to and including t12
+        zapnot  t2, t8, t2      # e0    : prepare source word; mirror changes
+        zapnot  t1, t8, t1      # .. e1 : to source validity mask
+        andnot  t0, t2, t0      # e0    : zero place for source to reside
+        or      t0, t1, t0      # e1    : and put it there
+        stq_u   t0, 0(a0)       # e0    : store the merged word, then fall into $finish_up
+$finish_up:
+        zapnot  t0, t12, t4     # was last byte written null?
+        cmovne  t4, 1, t4       # t4 = 1 if that byte was non-zero, else stays 0
+        and     t12, 0xf0, t3   # binary search for the address of the
+        and     t12, 0xcc, t2   # last byte written
+        and     t12, 0xaa, t1
+        bic     a0, 7, t0       # t0 = a0 rounded down to an 8-byte boundary
+        cmovne  t3, 4, t3       # 4 if t12's bit is in bytes 4-7
+        cmovne  t2, 2, t2       # 2 if it is in bytes 2,3,6,7
+        cmovne  t1, 1, t1       # 1 if it is in an odd byte
+        addq    t0, t3, t0
+        addq    t1, t2, t1
+        addq    t0, t1, t0      # t0 = word base + byte index from t12
+        addq    t0, t4, t0      # add one if we filled the buffer
+        subq    t0, v0, v0      # find string length (v0 set out of view; presumably the start address)
+        ret
+$zerolength:
+        clr     v0              # return 0
+$exception:
+        ret                     # v0 returned as-is; NOTE(review): presumably set by the EX() fixup - confirm
+        .end __strncpy_from_user
author	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-22 10:38:37 -0500
committer	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-22 10:38:37 -0500
commit	fcc9d2e5a6c89d22b8b773a64fb4ad21ac318446 (patch)
tree	a57612d1888735a2ec7972891b68c1ac5ec8faea /arch/alpha/lib
parent	8dea78da5cee153b8af9c07a2745f6c55057fe12 (diff)