crypto: sha-mb - SHA1 multibuffer submit and flush routines for AVX2

This patch introduces the routines used to submit and flush buffers belonging to SHA1 crypto jobs to the SHA1 multibuffer algorithm. It is implemented mostly in assembly optimized with AVX2 instructions. Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Tim Chen <tim.c.chen@linux.intel.com> 2014-07-31 13:29:57 -0400
committer: Herbert Xu <herbert@gondor.apana.org.au> 2014-08-25 08:32:28 -0400
commit: 2249cbb53ead12539c4ab7f422400e82263d174b (patch)
tree: ede26e6abda14de4966fde522c8e1fdd0f0e2bd2 /arch/x86/crypto
parent: 116177782392739f06868cfc2e6df5267aec4639 (diff)
3 files changed, 619 insertions, 0 deletions
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..85c4e1cf7172
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
@@ -0,0 +1,327 @@
+/*
+ * Flush routine for SHA1 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      James Guilford <james.guilford@intel.com>
+ *      Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+.extern sha1_x8_avx2
+/*
+ * Register roles used by the flush and get_comp_job routines.
+ * Several names alias the same register, so take care that two
+ * aliases of one register are never needed at the same time.
+ */
+# LINUX register definitions
+#define arg1    %rdi
+#define arg2    %rsi
+# Common definitions
+#define state   arg1
+#define job     arg2
+#define len2    arg2
+# idx must be a register not clobbered by sha1_x8_avx2
+#define idx             %r8
+#define DWORD_idx       %r8d
+#define unused_lanes    %rbx
+#define lane_data       %rbx
+#define tmp2            %rbx
+#define tmp2_w          %ebx
+#define job_rax         %rax
+#define tmp1            %rax
+#define size_offset     %rax
+#define tmp             %rax
+#define start_offset    %rax
+/*
+ * arg1 and arg2 already expand to a register name that carries the
+ * percent prefix, so these aliases must not add a second one
+ * (that would expand to an invalid "%%rdi" / "%%rsi").
+ */
+#define tmp3            arg1
+#define extra_blocks    arg2
+#define p               arg2
+/*
+ * Stack frame layout, as offsets from the realigned stack pointer:
+ *   _XMM_SAVE : 10 * 16 bytes reserved for XMM registers
+ *   _GPR_SAVE :  8 *  8 bytes for general purpose registers
+ */
+# STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE  = 10*16
+_GPR_SAVE_SIZE  = 8*8
+_ALIGN_SIZE     = 8
+_XMM_SAVE       = 0
+_GPR_SAVE       = _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE     = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+.macro LABEL prefix n
+\prefix\n\():                  # emits a label whose name joins both arguments
+.endm
+.macro JNE_SKIP i
+jne     skip_\i                # jumps to the skip_ label numbered by the argument
+.endm
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset              # stores the argument in the assembler symbol offset
+.endm
+.noaltmacro
+/*
+ * JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
+ * arg 1 : rdi : state
+ *
+ * Completes one job without waiting for every lane to be filled.
+ * Each lane that holds no job is pointed at the data of a lane that
+ * does hold one and is given the length 0xFFFFFFFF.  The smallest
+ * remaining length is then handed to sha1_x8_avx2, after which the
+ * job in that lane is marked STS_COMPLETED, its lane number is pushed
+ * back onto unused_lanes and its five digest words are copied into
+ * the result_digest field of the job.
+ *
+ * Returns the completed job in rax, or NULL when bit 35 of
+ * unused_lanes is set (all lanes are empty).
+ */
+ENTRY(sha1_mb_mgr_flush_avx2)
+        mov     %rsp, %r10
+        sub     $STACK_SPACE, %rsp
+        and     $~31, %rsp                # realign the frame to 32 bytes
+        mov     %rbx, _GPR_SAVE(%rsp)
+        mov     %r10, _GPR_SAVE+8*1(%rsp) #save rsp
+        mov     %rbp, _GPR_SAVE+8*3(%rsp)
+        mov     %r12, _GPR_SAVE+8*4(%rsp)
+        mov     %r13, _GPR_SAVE+8*5(%rsp)
+        mov     %r14, _GPR_SAVE+8*6(%rsp)
+        mov     %r15, _GPR_SAVE+8*7(%rsp)
+        # If bit (32+3) is set, then all lanes are empty
+        mov     _unused_lanes(state), unused_lanes
+        bt      $32+3, unused_lanes
+        jc      return_null
+        # find a lane with a non-null job
+        # (the last matching lane wins; lane 0 is the fallback)
+        xor     idx, idx
+        offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  one(%rip), idx
+        offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  two(%rip), idx
+        offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  three(%rip), idx
+        offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  four(%rip), idx
+        offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  five(%rip), idx
+        offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  six(%rip), idx
+        offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+        cmovne  seven(%rip), idx
+        # copy idx to empty lanes
+copy_lane_data:
+        offset =  (_args + _data_ptr)
+        mov     offset(state,idx,8), tmp
+        # for each of the 8 lanes: when it holds no job, give it the
+        # data pointer loaded above and the maximum length
+        I = 0
+.rep 8
+        offset =  (_ldata + I * _LANE_DATA_size + _job_in_lane)
+        cmpq    $0, offset(state)
+.altmacro
+        JNE_SKIP %I
+        offset =  (_args + _data_ptr + 8*I)
+        mov     tmp, offset(state)
+        offset =  (_lens + 4*I)
+        movl    $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+        I = (I+1)
+.noaltmacro
+.endr
+        # Find min length
+        # Unaligned moves are used on purpose: nothing here guarantees
+        # that the manager structure is 16-byte aligned, and vmovdqa
+        # faults on a misaligned address.
+        vmovdqu _lens+0*16(state), %xmm0
+        vmovdqu _lens+1*16(state), %xmm1
+        vpminud %xmm1, %xmm0, %xmm2     # xmm2 has {D,C,B,A}
+        vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
+        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
+        vpalignr $4, %xmm2, %xmm3, %xmm3    # xmm3 has {x,x,x,E}
+        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
+        # low nibble of the minimum is the lane index, the rest is the length
+        vmovd   %xmm2, DWORD_idx
+        mov     idx, len2
+        and     $0xF, idx
+        shr     $4, len2
+        jz      len_is_0
+        # subtract the minimum length (lane nibble cleared) from all lanes
+        vpand   clear_low_nibble(%rip), %xmm2, %xmm2
+        vpshufd $0, %xmm2, %xmm2
+        vpsubd  %xmm2, %xmm0, %xmm0
+        vpsubd  %xmm2, %xmm1, %xmm1
+        vmovdqu %xmm0, _lens+0*16(state)
+        vmovdqu %xmm1, _lens+1*16(state)
+        # "state" and "args" are the same address, arg1
+        # len is arg2
+        call    sha1_x8_avx2
+        # state and idx are intact
+len_is_0:
+        # process completed job "idx"
+        imul    $_LANE_DATA_size, idx, lane_data
+        lea     _ldata(state, lane_data), lane_data
+        mov     _job_in_lane(lane_data), job_rax
+        movq    $0, _job_in_lane(lane_data)
+        movl    $STS_COMPLETED, _status(job_rax)
+        # push the freed lane number onto the unused_lanes nibble stack
+        mov     _unused_lanes(state), unused_lanes
+        shl     $4, unused_lanes
+        or      idx, unused_lanes
+        mov     unused_lanes, _unused_lanes(state)
+        movl    $0xFFFFFFFF, _lens(state, idx, 4)
+        # gather the five digest words of lane idx (stored 32 bytes apart)
+        vmovd    _args_digest(state , idx, 4) , %xmm0
+        vpinsrd  $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+        vpinsrd  $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+        vpinsrd  $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+        movl    _args_digest+4*32(state, idx, 4), tmp2_w
+        vmovdqu  %xmm0, _result_digest(job_rax)
+        offset =  (_result_digest + 1*16)
+        mov     tmp2_w, offset(job_rax)
+return:
+        mov     _GPR_SAVE(%rsp), %rbx
+        mov     _GPR_SAVE+8*1(%rsp), %r10 #saved rsp
+        mov     _GPR_SAVE+8*3(%rsp), %rbp
+        mov     _GPR_SAVE+8*4(%rsp), %r12
+        mov     _GPR_SAVE+8*5(%rsp), %r13
+        mov     _GPR_SAVE+8*6(%rsp), %r14
+        mov     _GPR_SAVE+8*7(%rsp), %r15
+        mov     %r10, %rsp
+        ret
+return_null:
+        xor     job_rax, job_rax
+        jmp     return
+ENDPROC(sha1_mb_mgr_flush_avx2)
+#################################################################
+/*
+ * JOB* sha1_mb_mgr_get_comp_job_avx2(MB_MGR *state)
+ * arg 1 : rdi : state
+ *
+ * Hands back a job whose remaining length is already zero, without
+ * doing any hashing.  The job is marked STS_COMPLETED, its lane is
+ * pushed back onto unused_lanes and its five digest words are copied
+ * into the result_digest field of the job.
+ *
+ * Returns the job in rax, or NULL when all lanes are empty or the
+ * smallest remaining length is not zero.
+ */
+.align 16
+ENTRY(sha1_mb_mgr_get_comp_job_avx2)
+        push    %rbx
+        ## if bit 32+3 is set, then all lanes are empty
+        mov     _unused_lanes(state), unused_lanes
+        bt      $(32+3), unused_lanes
+        jc      .return_null
+        # Find min length
+        # Unaligned moves are used on purpose: nothing here guarantees
+        # that the manager structure is 16-byte aligned, and vmovdqa
+        # faults on a misaligned address.
+        vmovdqu _lens(state), %xmm0
+        vmovdqu _lens+1*16(state), %xmm1
+        vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
+        vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
+        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
+        vpalignr $4, %xmm2, %xmm3, %xmm3    # xmm3 has {x,x,x,E}
+        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
+        vmovd   %xmm2, DWORD_idx
+        # any bit above the lane nibble means the length is not zero yet
+        test    $~0xF, idx
+        jnz     .return_null
+        # process completed job "idx"
+        imul    $_LANE_DATA_size, idx, lane_data
+        lea     _ldata(state, lane_data), lane_data
+        mov     _job_in_lane(lane_data), job_rax
+        movq    $0,  _job_in_lane(lane_data)
+        movl    $STS_COMPLETED, _status(job_rax)
+        # push the freed lane number onto the unused_lanes nibble stack
+        mov     _unused_lanes(state), unused_lanes
+        shl     $4, unused_lanes
+        or      idx, unused_lanes
+        mov     unused_lanes, _unused_lanes(state)
+        movl    $0xFFFFFFFF, _lens(state,  idx, 4)
+        # gather the five digest words of lane idx (stored 32 bytes apart)
+        vmovd   _args_digest(state, idx, 4), %xmm0
+        vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+        vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+        vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+        movl    _args_digest+4*32(state, idx, 4), tmp2_w
+        vmovdqu %xmm0, _result_digest(job_rax)
+        movl    tmp2_w, _result_digest+1*16(job_rax)
+        pop     %rbx
+        ret
+.return_null:
+        xor     job_rax, job_rax
+        pop     %rbx
+        ret
+ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
+/*
+ * Constants used by the routines above.  They are only ever read, so
+ * they live in .rodata instead of the writable .data section.
+ *
+ * clear_low_nibble masks the lane index out of the low dword.
+ * one .. seven are the 64-bit memory operands of the cmovne chain
+ * (cmov has no immediate form).
+ */
+.section .rodata
+.align 16
+clear_low_nibble:
+.octa   0x000000000000000000000000FFFFFFF0
+one:
+.quad  1
+two:
+.quad  2
+three:
+.quad  3
+four:
+.quad  4
+five:
+.quad  5
+six:
+.quad  6
+seven:
+.quad  7
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..4ca7e166a2aa
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Initialization code for multi buffer SHA1 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "sha_mb_mgr.h"
+/*
+ * Put the SHA1 multibuffer manager into its empty state.
+ *
+ * unused_lanes becomes a stack of 4-bit lane numbers 0..7 (lowest
+ * nibble on top) with 0xF above them; every lane gets the maximum
+ * length and no job.
+ */
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
+{
+        unsigned int lane = 0;
+
+        state->unused_lanes = 0xF76543210;
+        while (lane < 8) {
+                state->ldata[lane].job_in_lane = NULL;
+                state->lens[lane] = 0xFFFFFFFF;
+                lane++;
+        }
+}
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..2ab9560b53c8
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
@@ -0,0 +1,228 @@
+/*
+ * Buffer submit code for multi buffer SHA1 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  Contact Information:
+ *      James Guilford <james.guilford@intel.com>
+ *      Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ *  BSD LICENSE
+ *
+ *  Copyright(c) 2014 Intel Corporation.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in
+ *      the documentation and/or other materials provided with the
+ *      distribution.
+ *    * Neither the name of Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+/* the only external routine called from this file */
+.extern sha1_x8_avx2
+/*
+ * Register roles used by the submit routine.  Several names alias
+ * the same register, so take care that two aliases of one register
+ * are never needed at the same time.
+ */
+# LINUX register definitions
+arg1    = %rdi
+arg2    = %rsi
+size_offset     = %rcx
+tmp2            = %rcx
+extra_blocks    = %rdx
+# Common definitions
+#define state   arg1
+#define job     %rsi
+#define len2    arg2
+#define p2      arg2
+# idx must be a register not clobbered by sha1_x8_avx2
+idx             = %r8
+DWORD_idx       = %r8d
+last_len        = %r8
+p               = %r11
+start_offset    = %r11
+unused_lanes    = %rbx
+BYTE_unused_lanes = %bl
+job_rax         = %rax
+len             = %rax
+DWORD_len       = %eax
+lane            = %rbp
+tmp3            = %rbp
+tmp             = %r9
+DWORD_tmp       = %r9d
+lane_data       = %r10
+# STACK_SPACE needs to be an odd multiple of 8
+STACK_SPACE     = 8*8 + 16*10 + 8
+/*
+ * JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
+ * arg 1 : rdi : state
+ * arg 2 : rsi : job
+ *
+ * Pops a free lane from unused_lanes, marks the job
+ * STS_BEING_PROCESSED and stores its length, digest words and buffer
+ * pointer in that lane.  While free lanes remain afterwards nothing
+ * is hashed and NULL is returned.  Once the last lane has been taken,
+ * the smallest remaining length is handed to sha1_x8_avx2 and the job
+ * in that lane is finished: it is marked STS_COMPLETED, its lane is
+ * pushed back onto unused_lanes and its five digest words are copied
+ * into the result_digest field of the job.
+ *
+ * Returns the completed job in rax, or NULL.
+ */
+ENTRY(sha1_mb_mgr_submit_avx2)
+        mov     %rsp, %r10
+        sub     $STACK_SPACE, %rsp
+        and     $~31, %rsp              # realign the frame to 32 bytes
+        mov     %rbx, (%rsp)
+        mov     %r10, 8*2(%rsp) #save old rsp
+        mov     %rbp, 8*3(%rsp)
+        mov     %r12, 8*4(%rsp)
+        mov     %r13, 8*5(%rsp)
+        mov     %r14, 8*6(%rsp)
+        mov     %r15, 8*7(%rsp)
+        # pop the next free lane number from the low nibble of unused_lanes
+        mov     _unused_lanes(state), unused_lanes
+        mov     unused_lanes, lane
+        and     $0xF, lane
+        shr     $4, unused_lanes
+        imul    $_LANE_DATA_size, lane, lane_data
+        movl    $STS_BEING_PROCESSED, _status(job)
+        lea     _ldata(state, lane_data), lane_data
+        mov     unused_lanes, _unused_lanes(state)
+        movl    _len(job),  DWORD_len
+        mov     job, _job_in_lane(lane_data)
+        # lens entry = (length << 4) | lane number
+        shl     $4, len
+        or      lane, len
+        movl    DWORD_len,  _lens(state , lane, 4)
+        # Load digest words from result_digest
+        vmovdqu _result_digest(job), %xmm0
+        mov     _result_digest+1*16(job), DWORD_tmp
+        vmovd    %xmm0, _args_digest(state, lane, 4)
+        vpextrd  $1, %xmm0, _args_digest+1*32(state , lane, 4)
+        vpextrd  $2, %xmm0, _args_digest+2*32(state , lane, 4)
+        vpextrd  $3, %xmm0, _args_digest+3*32(state , lane, 4)
+        movl    DWORD_tmp, _args_digest+4*32(state , lane, 4)
+        mov     _buffer(job), p
+        mov     p, _args_data_ptr(state, lane, 8)
+        # only the 0xF marker left means that every lane is now in use
+        cmp     $0xF, unused_lanes
+        jne     return_null
+start_loop:
+        # Find min length
+        # Unaligned moves are used on purpose: nothing here guarantees
+        # that the manager structure is 16-byte aligned, and vmovdqa
+        # faults on a misaligned address.
+        vmovdqu _lens(state), %xmm0
+        vmovdqu _lens+1*16(state), %xmm1
+        vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
+        vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
+        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
+        vpalignr $4, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,x,E}
+        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword
+        # low nibble of the minimum is the lane index, the rest is the length
+        vmovd   %xmm2, DWORD_idx
+        mov    idx, len2
+        and    $0xF, idx
+        shr    $4, len2
+        jz     len_is_0
+        # subtract the minimum length (lane nibble cleared) from all lanes
+        vpand   clear_low_nibble(%rip), %xmm2, %xmm2
+        vpshufd $0, %xmm2, %xmm2
+        vpsubd  %xmm2, %xmm0, %xmm0
+        vpsubd  %xmm2, %xmm1, %xmm1
+        vmovdqu %xmm0, _lens + 0*16(state)
+        vmovdqu %xmm1, _lens + 1*16(state)
+        # "state" and "args" are the same address, arg1
+        # len is arg2
+        call    sha1_x8_avx2
+        # state and idx are intact
+len_is_0:
+        # process completed job "idx"
+        imul    $_LANE_DATA_size, idx, lane_data
+        lea     _ldata(state, lane_data), lane_data
+        mov     _job_in_lane(lane_data), job_rax
+        mov     _unused_lanes(state), unused_lanes
+        movq    $0, _job_in_lane(lane_data)
+        movl    $STS_COMPLETED, _status(job_rax)
+        # push the freed lane number onto the unused_lanes nibble stack
+        shl     $4, unused_lanes
+        or      idx, unused_lanes
+        mov     unused_lanes, _unused_lanes(state)
+        movl    $0xFFFFFFFF, _lens(state, idx, 4)
+        # gather the five digest words of lane idx (stored 32 bytes apart)
+        vmovd    _args_digest(state, idx, 4), %xmm0
+        vpinsrd  $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+        vpinsrd  $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+        vpinsrd  $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+        movl    _args_digest+4*32(state, idx, 4), DWORD_tmp
+        vmovdqu  %xmm0, _result_digest(job_rax)
+        movl    DWORD_tmp, _result_digest+1*16(job_rax)
+return:
+        mov     (%rsp), %rbx
+        mov     8*2(%rsp), %r10 #restore old rsp
+        mov     8*3(%rsp), %rbp
+        mov     8*4(%rsp), %r12
+        mov     8*5(%rsp), %r13
+        mov     8*6(%rsp), %r14
+        mov     8*7(%rsp), %r15
+        mov     %r10, %rsp
+        ret
+return_null:
+        xor     job_rax, job_rax
+        jmp     return
+ENDPROC(sha1_mb_mgr_submit_avx2)
+/*
+ * Mask that clears the lane index (low nibble) of the low dword.
+ * It is only ever read, so it lives in .rodata instead of .data.
+ */
+.section .rodata
+.align 16
+clear_low_nibble:
+        .octa   0x000000000000000000000000FFFFFFF0
author	Tim Chen <tim.c.chen@linux.intel.com>	2014-07-31 13:29:57 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2014-08-25 08:32:28 -0400
commit	2249cbb53ead12539c4ab7f422400e82263d174b (patch)
tree	ede26e6abda14de4966fde522c8e1fdd0f0e2bd2 /arch/x86/crypto
parent	116177782392739f06868cfc2e6df5267aec4639 (diff)

diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S new file mode 100644 index 000000000000..85c4e1cf7172 --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
@@ -0,0 +1,327 @@
	1	/*
	2	* Flush routine for SHA1 multibuffer
	3	*
	4	* This file is provided under a dual BSD/GPLv2 license. When using or
	5	* redistributing this file, you may do so under either license.
	6	*
	7	* GPL LICENSE SUMMARY
	8	*
	9	* Copyright(c) 2014 Intel Corporation.
	10	*
	11	* This program is free software; you can redistribute it and/or modify
	12	* it under the terms of version 2 of the GNU General Public License as
	13	* published by the Free Software Foundation.
	14	*
	15	* This program is distributed in the hope that it will be useful, but
	16	* WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	18	* General Public License for more details.
	19	*
	20	* Contact Information:
	21	* James Guilford <james.guilford@intel.com>
	22	* Tim Chen <tim.c.chen@linux.intel.com>
	23	*
	24	* BSD LICENSE
	25	*
	26	* Copyright(c) 2014 Intel Corporation.
	27	*
	28	* Redistribution and use in source and binary forms, with or without
	29	* modification, are permitted provided that the following conditions
	30	* are met:
	31	*
	32	* * Redistributions of source code must retain the above copyright
	33	* notice, this list of conditions and the following disclaimer.
	34	* * Redistributions in binary form must reproduce the above copyright
	35	* notice, this list of conditions and the following disclaimer in
	36	* the documentation and/or other materials provided with the
	37	* distribution.
	38	* * Neither the name of Intel Corporation nor the names of its
	39	* contributors may be used to endorse or promote products derived
	40	* from this software without specific prior written permission.
	41	*
	42	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	43	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	44	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	45	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	46	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	47	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	48	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	49	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	50	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	51	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	52	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	53	*/
	54	#include <linux/linkage.h>
	55	#include "sha1_mb_mgr_datastruct.S"
	56
	57
	58	.extern sha1_x8_avx2
	59
	60	# LINUX register definitions
	61	#define arg1 %rdi
	62	#define arg2 %rsi
	63
	64	# Common definitions
	65	#define state arg1
	66	#define job arg2
	67	#define len2 arg2
	68
	69	# idx must be a register not clobbered by sha1_x8_avx2
	70	#define idx %r8
	71	#define DWORD_idx %r8d
	72
	73	#define unused_lanes %rbx
	74	#define lane_data %rbx
	75	#define tmp2 %rbx
	76	#define tmp2_w %ebx
	77
	78	#define job_rax %rax
	79	#define tmp1 %rax
	80	#define size_offset %rax
	81	#define tmp %rax
	82	#define start_offset %rax
	83	/* arg1 and arg2 already carry the percent prefix; the aliases below must not add another */
	84	#define tmp3 arg1
	85
	86	#define extra_blocks arg2
	87	#define p arg2
	88
	89
	90	# STACK_SPACE needs to be an odd multiple of 8
	91	_XMM_SAVE_SIZE = 10*16
	92	_GPR_SAVE_SIZE = 8*8
	93	_ALIGN_SIZE = 8
	94
	95	_XMM_SAVE = 0
	96	_GPR_SAVE = _XMM_SAVE + _XMM_SAVE_SIZE
	97	STACK_SPACE = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
	98
	99	.macro LABEL prefix n
	100	\prefix\n\(): # emits a label whose name joins both arguments
	101	.endm
	102
	103	.macro JNE_SKIP i
	104	jne skip_\i # jumps to the skip_ label numbered by the argument
	105	.endm
	106
	107	.altmacro
	108	.macro SET_OFFSET _offset
	109	offset = \_offset # stores the argument in the assembler symbol offset
	110	.endm
	111	.noaltmacro
	112
	113	# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
	114	# arg 1 : rdi : state
	115	ENTRY(sha1_mb_mgr_flush_avx2)
	116	mov %rsp, %r10
	117	sub $STACK_SPACE, %rsp
	118	and $~31, %rsp
	119	mov %rbx, _GPR_SAVE(%rsp)
	120	mov %r10, _GPR_SAVE+8*1(%rsp) #save rsp
	121	mov %rbp, _GPR_SAVE+8*3(%rsp)
	122	mov %r12, _GPR_SAVE+8*4(%rsp)
	123	mov %r13, _GPR_SAVE+8*5(%rsp)
	124	mov %r14, _GPR_SAVE+8*6(%rsp)
	125	mov %r15, _GPR_SAVE+8*7(%rsp)
	126
	127	# If bit (32+3) is set, then all lanes are empty
	128	mov _unused_lanes(state), unused_lanes
	129	bt $32+3, unused_lanes
	130	jc return_null
	131
	132	# find a lane with a non-null job
	133	xor idx, idx
	134	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
	135	cmpq $0, offset(state)
	136	cmovne one(%rip), idx
	137	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
	138	cmpq $0, offset(state)
	139	cmovne two(%rip), idx
	140	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
	141	cmpq $0, offset(state)
	142	cmovne three(%rip), idx
	143	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
	144	cmpq $0, offset(state)
	145	cmovne four(%rip), idx
	146	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
	147	cmpq $0, offset(state)
	148	cmovne five(%rip), idx
	149	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
	150	cmpq $0, offset(state)
	151	cmovne six(%rip), idx
	152	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
	153	cmpq $0, offset(state)
	154	cmovne seven(%rip), idx
	155
	156	# copy idx to empty lanes
	157	copy_lane_data:
	158	offset = (_args + _data_ptr)
	159	mov offset(state,idx,8), tmp
	160
	161	I = 0
	162	.rep 8
	163	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
	164	cmpq $0, offset(state)
	165	.altmacro
	166	JNE_SKIP %I
	167	offset = (_args + _data_ptr + 8*I)
	168	mov tmp, offset(state)
	169	offset = (_lens + 4*I)
	170	movl $0xFFFFFFFF, offset(state)
	171	LABEL skip_ %I
	172	I = (I+1)
	173	.noaltmacro
	174	.endr
	175
	176	# Find min length (unaligned moves: the structure is not known to be 16-byte aligned)
	177	vmovdqu _lens+0*16(state), %xmm0
	178	vmovdqu _lens+1*16(state), %xmm1
	179
	180	vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
	181	vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
	182	vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
	183	vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
	184	vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
	185
	186	vmovd %xmm2, DWORD_idx
	187	mov idx, len2
	188	and $0xF, idx
	189	shr $4, len2
	190	jz len_is_0
	191
	192	vpand clear_low_nibble(%rip), %xmm2, %xmm2
	193	vpshufd $0, %xmm2, %xmm2
	194
	195	vpsubd %xmm2, %xmm0, %xmm0
	196	vpsubd %xmm2, %xmm1, %xmm1
	197
	198	vmovdqu %xmm0, _lens+0*16(state)
	199	vmovdqu %xmm1, _lens+1*16(state)
	200
	201	# "state" and "args" are the same address, arg1
	202	# len is arg2
	203	call sha1_x8_avx2
	204	# state and idx are intact
	205
	206
	207	len_is_0:
	208	# process completed job "idx"
	209	imul $_LANE_DATA_size, idx, lane_data
	210	lea _ldata(state, lane_data), lane_data
	211
	212	mov _job_in_lane(lane_data), job_rax
	213	movq $0, _job_in_lane(lane_data)
	214	movl $STS_COMPLETED, _status(job_rax)
	215	mov _unused_lanes(state), unused_lanes
	216	shl $4, unused_lanes
	217	or idx, unused_lanes
	218	mov unused_lanes, _unused_lanes(state)
	219
	220	movl $0xFFFFFFFF, _lens(state, idx, 4)
	221
	222	vmovd _args_digest(state , idx, 4) , %xmm0
	223	vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	224	vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	225	vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	226	movl _args_digest+4*32(state, idx, 4), tmp2_w
	227
	228	vmovdqu %xmm0, _result_digest(job_rax)
	229	offset = (_result_digest + 1*16)
	230	mov tmp2_w, offset(job_rax)
	231
	232	return:
	233
	234	mov _GPR_SAVE(%rsp), %rbx
	235	mov _GPR_SAVE+8*1(%rsp), %r10 #saved rsp
	236	mov _GPR_SAVE+8*3(%rsp), %rbp
	237	mov _GPR_SAVE+8*4(%rsp), %r12
	238	mov _GPR_SAVE+8*5(%rsp), %r13
	239	mov _GPR_SAVE+8*6(%rsp), %r14
	240	mov _GPR_SAVE+8*7(%rsp), %r15
	241	mov %r10, %rsp
	242
	243	ret
	244
	245	return_null:
	246	xor job_rax, job_rax
	247	jmp return
	248	ENDPROC(sha1_mb_mgr_flush_avx2)
	249
	250
	251	#################################################################
	252
	253	.align 16
	254	ENTRY(sha1_mb_mgr_get_comp_job_avx2)
	255	push %rbx
	256
	257	## if bit 32+3 is set, then all lanes are empty
	258	mov _unused_lanes(state), unused_lanes
	259	bt $(32+3), unused_lanes
	260	jc .return_null
	261
	262	# Find min length (unaligned moves: the structure is not known to be 16-byte aligned)
	263	vmovdqu _lens(state), %xmm0
	264	vmovdqu _lens+1*16(state), %xmm1
	265
	266	vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
	267	vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
	268	vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
	269	vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
	270	vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
	271
	272	vmovd %xmm2, DWORD_idx
	273	test $~0xF, idx
	274	jnz .return_null
	275
	276	# process completed job "idx"
	277	imul $_LANE_DATA_size, idx, lane_data
	278	lea _ldata(state, lane_data), lane_data
	279
	280	mov _job_in_lane(lane_data), job_rax
	281	movq $0, _job_in_lane(lane_data)
	282	movl $STS_COMPLETED, _status(job_rax)
	283	mov _unused_lanes(state), unused_lanes
	284	shl $4, unused_lanes
	285	or idx, unused_lanes
	286	mov unused_lanes, _unused_lanes(state)
	287
	288	movl $0xFFFFFFFF, _lens(state, idx, 4)
	289
	290	vmovd _args_digest(state, idx, 4), %xmm0
	291	vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	292	vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	293	vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	294	movl _args_digest+4*32(state, idx, 4), tmp2_w
	295
	296	vmovdqu %xmm0, _result_digest(job_rax)
	297	movl tmp2_w, _result_digest+1*16(job_rax)
	298
	299	pop %rbx
	300
	301	ret
	302
	303	.return_null:
	304	xor job_rax, job_rax
	305	pop %rbx
	306	ret
	307	ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
	308
	309	.section .rodata
	310	/* read-only constants: kept out of the writable .data section */
	311	.align 16
	312	clear_low_nibble:
	313	.octa 0x000000000000000000000000FFFFFFF0
	314	one:
	315	.quad 1
	316	two:
	317	.quad 2
	318	three:
	319	.quad 3
	320	four:
	321	.quad 4
	322	five:
	323	.quad 5
	324	six:
	325	.quad 6
	326	seven:
	327	.quad 7


diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c new file mode 100644 index 000000000000..4ca7e166a2aa --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,64 @@
	1	/*
	2	* Initialization code for multi buffer SHA1 algorithm for AVX2
	3	*
	4	* This file is provided under a dual BSD/GPLv2 license. When using or
	5	* redistributing this file, you may do so under either license.
	6	*
	7	* GPL LICENSE SUMMARY
	8	*
	9	* Copyright(c) 2014 Intel Corporation.
	10	*
	11	* This program is free software; you can redistribute it and/or modify
	12	* it under the terms of version 2 of the GNU General Public License as
	13	* published by the Free Software Foundation.
	14	*
	15	* This program is distributed in the hope that it will be useful, but
	16	* WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	18	* General Public License for more details.
	19	*
	20	* Contact Information:
	21	* Tim Chen <tim.c.chen@linux.intel.com>
	22	*
	23	* BSD LICENSE
	24	*
	25	* Copyright(c) 2014 Intel Corporation.
	26	*
	27	* Redistribution and use in source and binary forms, with or without
	28	* modification, are permitted provided that the following conditions
	29	* are met:
	30	*
	31	* * Redistributions of source code must retain the above copyright
	32	* notice, this list of conditions and the following disclaimer.
	33	* * Redistributions in binary form must reproduce the above copyright
	34	* notice, this list of conditions and the following disclaimer in
	35	* the documentation and/or other materials provided with the
	36	* distribution.
	37	* * Neither the name of Intel Corporation nor the names of its
	38	* contributors may be used to endorse or promote products derived
	39	* from this software without specific prior written permission.
	40	*
	41	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	42	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	43	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	44	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	45	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	46	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	47	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	48	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	49	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	50	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	51	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	52	*/
	53
	54	#include "sha_mb_mgr.h"
	55
	56	void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
	57	{
	58	unsigned int j;
	59	state->unused_lanes = 0xF76543210;
	60	for (j = 0; j < 8; j++) {
	61	state->lens[j] = 0xFFFFFFFF;
	62	state->ldata[j].job_in_lane = NULL;
	63	}
	64	}


diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S new file mode 100644 index 000000000000..2ab9560b53c8 --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
@@ -0,0 +1,228 @@
	1	/*
	2	* Buffer submit code for multi buffer SHA1 algorithm
	3	*
	4	* This file is provided under a dual BSD/GPLv2 license. When using or
	5	* redistributing this file, you may do so under either license.
	6	*
	7	* GPL LICENSE SUMMARY
	8	*
	9	* Copyright(c) 2014 Intel Corporation.
	10	*
	11	* This program is free software; you can redistribute it and/or modify
	12	* it under the terms of version 2 of the GNU General Public License as
	13	* published by the Free Software Foundation.
	14	*
	15	* This program is distributed in the hope that it will be useful, but
	16	* WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	18	* General Public License for more details.
	19	*
	20	* Contact Information:
	21	* James Guilford <james.guilford@intel.com>
	22	* Tim Chen <tim.c.chen@linux.intel.com>
	23	*
	24	* BSD LICENSE
	25	*
	26	* Copyright(c) 2014 Intel Corporation.
	27	*
	28	* Redistribution and use in source and binary forms, with or without
	29	* modification, are permitted provided that the following conditions
	30	* are met:
	31	*
	32	* * Redistributions of source code must retain the above copyright
	33	* notice, this list of conditions and the following disclaimer.
	34	* * Redistributions in binary form must reproduce the above copyright
	35	* notice, this list of conditions and the following disclaimer in
	36	* the documentation and/or other materials provided with the
	37	* distribution.
	38	* * Neither the name of Intel Corporation nor the names of its
	39	* contributors may be used to endorse or promote products derived
	40	* from this software without specific prior written permission.
	41	*
	42	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	43	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	44	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	45	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	46	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	47	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	48	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	49	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	50	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	51	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	52	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	53	*/
	54
	55	#include <linux/linkage.h>
	56	#include "sha1_mb_mgr_datastruct.S"
	57
	58
	59	.extern sha1_x8_avx
	60
	61	# LINUX register definitions
	62	arg1 = %rdi
	63	arg2 = %rsi
	64	size_offset = %rcx
	65	tmp2 = %rcx
	66	extra_blocks = %rdx
	67
	68	# Common definitions
	69	#define state arg1
	70	#define job %rsi
	71	#define len2 arg2
	72	#define p2 arg2
	73
	74	# idx must be a register not clobberred by sha1_x8_avx2
	75	idx = %r8
	76	DWORD_idx = %r8d
	77	last_len = %r8
	78
	79	p = %r11
	80	start_offset = %r11
	81
	82	unused_lanes = %rbx
	83	BYTE_unused_lanes = %bl
	84
	85	job_rax = %rax
	86	len = %rax
	87	DWORD_len = %eax
	88
	89	lane = %rbp
	90	tmp3 = %rbp
	91
	92	tmp = %r9
	93	DWORD_tmp = %r9d
	94
	95	lane_data = %r10
	96
	97	# STACK_SPACE needs to be an odd multiple of 8
	98	STACK_SPACE = 88 + 1610 + 8
	99
	100	# JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
	101	# arg 1 : rdi : state
	102	# arg 2 : rsi : job
		#
		# Places job in the lane named by the low nibble of state->unused_lanes,
		# marks it STS_BEING_PROCESSED and copies its length, five digest words
		# and buffer pointer into the per-lane arrays of state.
		#
		# Returns (in %rax):
		#   0     when unused_lanes is not 0xF after taking the lane; nothing is
		#         hashed in that case.
		#   job*  otherwise: the job in the lane with the smallest lens entry.
		#         Unless that length is zero, it is first subtracted from every
		#         lane and sha1_x8_avx2(state, length) is called.  The returned
		#         job is marked STS_COMPLETED, its lane is pushed back onto
		#         unused_lanes and its digest is copied to result_digest.
		#
		# Each lens entry is (length << 4) | lane, so one unsigned minimum over
		# the eight entries yields both the shortest length and its lane.
		#
		# %rbx, %rbp and %r12-%r15 are saved in the frame and restored on exit.
		# NOTE(review): %r12-%r15 are not touched by the code below; presumably
		# they are saved because sha1_x8_avx2 overwrites them - confirm.
	103	ENTRY(sha1_mb_mgr_submit_avx2)
	104
		# Carve out the frame and round %rsp down to a 32-byte boundary.  The
		# entry %rsp is held in %r10 until it is stored in the frame.
	105	mov %rsp, %r10
	106	sub $STACK_SPACE, %rsp
	107	and $~31, %rsp
	108
	109	mov %rbx, (%rsp)
	110	mov %r10, 8*2(%rsp) #save old rsp
	111	mov %rbp, 8*3(%rsp)
	112	mov %r12, 8*4(%rsp)
	113	mov %r13, 8*5(%rsp)
	114	mov %r14, 8*6(%rsp)
	115	mov %r15, 8*7(%rsp)
	116
		# Pop a lane number off the low nibble of unused_lanes and locate the
		# lane data record for it.
	117	mov _unused_lanes(state), unused_lanes
	118	mov unused_lanes, lane
	119	and $0xF, lane
	120	shr $4, unused_lanes
	121	imul $_LANE_DATA_size, lane, lane_data
	122	movl $STS_BEING_PROCESSED, _status(job)
	123	lea _ldata(state, lane_data), lane_data
	124	mov unused_lanes, _unused_lanes(state)
	125	movl _len(job), DWORD_len
	126
		# lens[lane] = (length << 4) | lane
	127	mov job, _job_in_lane(lane_data)
	128	shl $4, len
	129	or lane, len
	130
	131	movl DWORD_len, _lens(state , lane, 4)
	132
	133	# Load digest words from result_digest
		# and scatter them: word i of this lane goes to _args_digest + i*32 + lane*4
	134	vmovdqu _result_digest(job), %xmm0
	135	mov _result_digest+1*16(job), DWORD_tmp
	136	vmovd %xmm0, _args_digest(state, lane, 4)
	137	vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
	138	vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
	139	vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
	140	movl DWORD_tmp, _args_digest+4*32(state , lane, 4)
	141
	142	mov _buffer(job), p
	143	mov p, _args_data_ptr(state, lane, 8)
	144
		# Go on to hashing only when unused_lanes has shrunk to exactly 0xF;
		# otherwise return 0 with the job left queued.
	145	cmp $0xF, unused_lanes
	146	jne return_null
	147
	148	start_loop:
	149	# Find min length
	150	vmovdqa _lens(state), %xmm0
	151	vmovdqa _lens+1*16(state), %xmm1
	152
		# Only the low dwords of xmm3 matter below; its other contents are
		# never used.
	153	vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
	154	vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
	155	vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
	156	vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
	157	vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
	158
		# Split the minimum entry: idx = lane number (low nibble), len2 = length.
		# Writing len2 overwrites the job pointer, which is no longer needed.
	159	vmovd %xmm2, DWORD_idx
	160	mov idx, len2
	161	and $0xF, idx
	162	shr $4, len2
	163	jz len_is_0
	164
		# Subtract the minimum length from all eight lens entries.  The lane
		# nibble is masked out of the subtrahend so each entry keeps its own.
	165	vpand clear_low_nibble(%rip), %xmm2, %xmm2
	166	vpshufd $0, %xmm2, %xmm2
	167
	168	vpsubd %xmm2, %xmm0, %xmm0
	169	vpsubd %xmm2, %xmm1, %xmm1
	170
	171	vmovdqa %xmm0, _lens + 0*16(state)
	172	vmovdqa %xmm1, _lens + 1*16(state)
	173
	174
	175	# "state" and "args" are the same address, arg1
	176	# len is arg2
	177	call sha1_x8_avx2
	178
	179	# state and idx are intact
	180
	181	len_is_0:
	182	# process completed job "idx"
	183	imul $_LANE_DATA_size, idx, lane_data
	184	lea _ldata(state, lane_data), lane_data
	185
		# Detach the job from its lane, mark it completed and push the lane
		# number back onto the low nibble of unused_lanes.
	186	mov _job_in_lane(lane_data), job_rax
	187	mov _unused_lanes(state), unused_lanes
	188	movq $0, _job_in_lane(lane_data)
	189	movl $STS_COMPLETED, _status(job_rax)
	190	shl $4, unused_lanes
	191	or idx, unused_lanes
	192	mov unused_lanes, _unused_lanes(state)
	193
		# Same lens value that sha1_mb_mgr_init_avx2 gives a lane with no job.
	194	movl $0xFFFFFFFF, _lens(state, idx, 4)
	195
		# Gather the five digest words of lane idx back into the job.  The
		# fifth word is read from the same _args_digest+4*32 slot that the
		# submit path above stores to.
	196	vmovd _args_digest(state, idx, 4), %xmm0
	197	vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
	198	vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
	199	vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
	200	movl _args_digest+4*32(state, idx, 4), DWORD_tmp
	201
	202	vmovdqu %xmm0, _result_digest(job_rax)
	203	movl DWORD_tmp, _result_digest+1*16(job_rax)
	204
	205	return:
	206
	207	mov (%rsp), %rbx
	208	mov 8*2(%rsp), %r10 #restore old rsp
	209	mov 8*3(%rsp), %rbp
	210	mov 8*4(%rsp), %r12
	211	mov 8*5(%rsp), %r13
	212	mov 8*6(%rsp), %r14
	213	mov 8*7(%rsp), %r15
	214	mov %r10, %rsp
	215
	216	ret
	217
		# No job to hand back: return 0 through the common epilogue.
	218	return_null:
	219	xor job_rax, job_rax
	220	jmp return
	221
	222	ENDPROC(sha1_mb_mgr_submit_avx2)
	223
		# Constant used by sha1_mb_mgr_submit_avx2.
		# NOTE(review): the submit code only reads this value, so a read-only
		# section may be a better home than .data - confirm.
	224	.data
	225
	226	.align 16
		# Mask whose low dword is 0xFFFFFFF0 and whose upper 96 bits are zero.
		# The submit routine ANDs it with the minimum lens entry to strip the
		# lane nibble before subtracting that length from every lane.
	227	clear_low_nibble:
	228	.octa 0x000000000000000000000000FFFFFFF0