aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto
diff options
context:
space:
mode:
authorTim Chen <tim.c.chen@linux.intel.com>2014-07-31 13:29:57 -0400
committerHerbert Xu <herbert@gondor.apana.org.au>2014-08-25 08:32:28 -0400
commit2249cbb53ead12539c4ab7f422400e82263d174b (patch)
treeede26e6abda14de4966fde522c8e1fdd0f0e2bd2 /arch/x86/crypto
parent116177782392739f06868cfc2e6df5267aec4639 (diff)
crypto: sha-mb - SHA1 multibuffer submit and flush routines for AVX2
This patch introduces the routines used to submit and flush buffers belonging to SHA1 crypto jobs to the SHA1 multibuffer algorithm. It is implemented mostly in assembly optimized with AVX2 instructions. Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S327
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c64
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S228
3 files changed, 619 insertions, 0 deletions
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..85c4e1cf7172
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
@@ -0,0 +1,327 @@
1/*
2 * Flush routine for SHA1 multibuffer
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * James Guilford <james.guilford@intel.com>
22 * Tim Chen <tim.c.chen@linux.intel.com>
23 *
24 * BSD LICENSE
25 *
26 * Copyright(c) 2014 Intel Corporation.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * * Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * * Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in
36 * the documentation and/or other materials provided with the
37 * distribution.
38 * * Neither the name of Intel Corporation nor the names of its
39 * contributors may be used to endorse or promote products derived
40 * from this software without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 */
54#include <linux/linkage.h>
55#include "sha1_mb_mgr_datastruct.S"
56
57
58.extern sha1_x8_avx2
59
/*
 * Register aliases for the flush path (GAS, AT&T syntax).
 * Several aliases deliberately share one register; they name the different
 * roles that register plays at different points of the routine.
 */

# LINUX register definitions
#define arg1		%rdi
#define arg2		%rsi

# Common definitions
#define state		arg1
#define job		arg2
#define len2		arg2

# idx must be a register not clobbered by sha1_x8_avx2
#define idx		%r8
#define DWORD_idx	%r8d

#define unused_lanes	%rbx
#define lane_data	%rbx
#define tmp2		%rbx
#define tmp2_w		%ebx

#define job_rax		%rax
#define tmp1		%rax
#define size_offset	%rax
#define tmp		%rax
#define start_offset	%rax

/*
 * Aliases of the argument registers.  arg1/arg2 already expand to a
 * register name including its '%' prefix, so no extra '%' may be written
 * here (that would expand to the invalid operand "%%rdi" / "%%rsi").
 */
#define tmp3		arg1

#define extra_blocks	arg2
#define p		arg2


/*
 * Stack frame layout, as offsets from the stack pointer after the frame
 * has been allocated: an XMM save area followed by eight 8-byte GPR slots
 * and 8 bytes of padding.
 */
# STACK_SPACE needs to be an odd multiple of 8
_XMM_SAVE_SIZE	= 10*16
_GPR_SAVE_SIZE	= 8*8
_ALIGN_SIZE	= 8

_XMM_SAVE	= 0
_GPR_SAVE	= _XMM_SAVE + _XMM_SAVE_SIZE
STACK_SPACE	= _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
98
/*
 * LABEL stem num
 * Define the label formed by pasting stem and num together, e.g.
 * "LABEL skip_ 3" defines "skip_3:".
 */
.macro LABEL stem num
\stem\num\():
.endm

/*
 * JNE_SKIP lane
 * Branch to the label skip_<lane> when the zero flag is clear.
 */
.macro JNE_SKIP lane
	jne	skip_\lane
.endm

/*
 * SET_OFFSET value
 * Assign value to the assembler symbol "offset".
 */
.altmacro
.macro SET_OFFSET value
offset = \value
.endm
.noaltmacro
112
/*
 * JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
 * arg 1 : rdi : state
 *
 * Returns in rax the job that was completed, or NULL when every lane is
 * empty.
 *
 * Lanes that hold no job are pointed at the data of a lane that does and
 * are given the length 0xFFFFFFFF.  The minimum of the eight length words
 * is then taken; each word carries the lane number in its low nibble and
 * the length above it.  If that minimum length is non-zero, sha1_x8_avx2
 * is called for that length.  Finally the selected lane is released and
 * its five digest words are copied into the job.
 */
ENTRY(sha1_mb_mgr_flush_avx2)
	/* Build a 32-byte aligned frame; r10 carries the incoming rsp. */
	mov	%rsp, %r10
	sub	$STACK_SPACE, %rsp
	and	$~31, %rsp
	mov	%rbx, _GPR_SAVE(%rsp)
	mov	%r10, _GPR_SAVE+8*1(%rsp)	#save rsp
	mov	%rbp, _GPR_SAVE+8*3(%rsp)
	mov	%r12, _GPR_SAVE+8*4(%rsp)
	mov	%r13, _GPR_SAVE+8*5(%rsp)
	mov	%r14, _GPR_SAVE+8*6(%rsp)
	mov	%r15, _GPR_SAVE+8*7(%rsp)

	# If bit (32+3) is set, then all lanes are empty
	mov	_unused_lanes(state), unused_lanes
	bt	$32+3, unused_lanes
	jc	.Lflush_return_null

	# find a lane with a non-null job
	/*
	 * idx starts as lane 0 and is overwritten by every later lane that
	 * holds a job, so the highest-numbered busy lane is the one kept.
	 */
	xor	idx, idx
	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	one(%rip), idx
	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	two(%rip), idx
	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	three(%rip), idx
	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	four(%rip), idx
	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	five(%rip), idx
	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	six(%rip), idx
	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	seven(%rip), idx

	# copy idx to empty lanes
.Lflush_copy_lane_data:
	offset = (_args + _data_ptr)
	mov	offset(state,idx,8), tmp	# data pointer of lane idx

	/*
	 * For each lane I whose job pointer is NULL: store the data pointer
	 * loaded above and set the length word to 0xFFFFFFFF.  Lanes that
	 * hold a job branch to skip_I and are left untouched.
	 */
	I = 0
.rep 8
	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
.altmacro
	JNE_SKIP %I
	offset = (_args + _data_ptr + 8*I)
	mov	tmp, offset(state)
	offset = (_lens + 4*I)
	movl	$0xFFFFFFFF, offset(state)
LABEL skip_ %I
	I = (I+1)
.noaltmacro
.endr

	# Find min length
	vmovdqa	_lens+0*16(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min value in low dword

	/* Split the minimum into lane number (low nibble) and length. */
	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx
	shr	$4, len2
	jz	.Lflush_len_is_0

	/* Subtract the minimum length (lane nibble cleared) from all lanes. */
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2

	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqa	%xmm0, _lens+0*16(state)
	vmovdqa	%xmm1, _lens+1*16(state)

	# "state" and "args" are the same address, arg1
	# len is arg2
	call	sha1_x8_avx2
	# state and idx are intact


.Lflush_len_is_0:
	# process completed job "idx"
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	/* Push the freed lane number onto the unused_lanes nibble list. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/* Gather the five digest words of lane idx (32 bytes apart). */
	vmovd	_args_digest(state , idx, 4) , %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	movl	_args_digest+4*32(state, idx, 4), tmp2_w

	vmovdqu	%xmm0, _result_digest(job_rax)
	offset = (_result_digest + 1*16)
	mov	tmp2_w, offset(job_rax)

.Lflush_return:

	mov	_GPR_SAVE(%rsp), %rbx
	mov	_GPR_SAVE+8*1(%rsp), %r10	#saved rsp
	mov	_GPR_SAVE+8*3(%rsp), %rbp
	mov	_GPR_SAVE+8*4(%rsp), %r12
	mov	_GPR_SAVE+8*5(%rsp), %r13
	mov	_GPR_SAVE+8*6(%rsp), %r14
	mov	_GPR_SAVE+8*7(%rsp), %r15
	mov	%r10, %rsp

	ret

.Lflush_return_null:
	xor	job_rax, job_rax
	jmp	.Lflush_return
ENDPROC(sha1_mb_mgr_flush_avx2)
249
250
251#################################################################
252
/*
 * JOB* sha1_mb_mgr_get_comp_job_avx2(MB_MGR *state)
 * arg 1 : rdi : state
 *
 * Returns in rax a job whose remaining length is already zero, without
 * calling the hash routine.  Returns NULL when all lanes are empty or when
 * the smallest remaining length is non-zero.
 */
.align 16
ENTRY(sha1_mb_mgr_get_comp_job_avx2)
	push	%rbx

	## if bit 32+3 is set, then all lanes are empty
	mov	_unused_lanes(state), unused_lanes
	bt	$(32+3), unused_lanes
	jc	.Lcomp_return_null

	# Find min length
	vmovdqa	_lens(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min value in low dword

	/*
	 * The low nibble of the minimum is the lane number; any bit set
	 * above it means the length is non-zero and no job is finished.
	 */
	vmovd	%xmm2, DWORD_idx
	test	$~0xF, idx
	jnz	.Lcomp_return_null

	# process completed job "idx"
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	/* Push the freed lane number onto the unused_lanes nibble list. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/* Gather the five digest words of lane idx (32 bytes apart). */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	movl	_args_digest+4*32(state, idx, 4), tmp2_w

	vmovdqu	%xmm0, _result_digest(job_rax)
	movl	tmp2_w, _result_digest+1*16(job_rax)

	pop	%rbx

	ret

.Lcomp_return_null:
	xor	job_rax, job_rax
	pop	%rbx
	ret
ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
308
/*
 * Constants used by the flush routines.  They are only ever read, so they
 * are placed in the read-only data section.
 */
.section .rodata

.align 16
/* Mask that clears the lane nibble of the low dword; upper dwords zero. */
clear_low_nibble:
	.octa	0x000000000000000000000000FFFFFFF0
/* Lane numbers as 64-bit memory operands for cmovne. */
one:
	.quad	1
two:
	.quad	2
three:
	.quad	3
four:
	.quad	4
five:
	.quad	5
six:
	.quad	6
seven:
	.quad	7
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..4ca7e166a2aa
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,64 @@
1/*
2 * Initialization code for multi buffer SHA1 algorithm for AVX2
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Tim Chen <tim.c.chen@linux.intel.com>
22 *
23 * BSD LICENSE
24 *
25 * Copyright(c) 2014 Intel Corporation.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 *
31 * * Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * * Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in
35 * the documentation and/or other materials provided with the
36 * distribution.
37 * * Neither the name of Intel Corporation nor the names of its
38 * contributors may be used to endorse or promote products derived
39 * from this software without specific prior written permission.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
44 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
45 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
46 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
47 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
52 */
53
54#include "sha_mb_mgr.h"
55
56void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
57{
58 unsigned int j;
59 state->unused_lanes = 0xF76543210;
60 for (j = 0; j < 8; j++) {
61 state->lens[j] = 0xFFFFFFFF;
62 state->ldata[j].job_in_lane = NULL;
63 }
64}
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..2ab9560b53c8
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
@@ -0,0 +1,228 @@
1/*
2 * Buffer submit code for multi buffer SHA1 algorithm
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * James Guilford <james.guilford@intel.com>
22 * Tim Chen <tim.c.chen@linux.intel.com>
23 *
24 * BSD LICENSE
25 *
26 * Copyright(c) 2014 Intel Corporation.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * * Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * * Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in
36 * the documentation and/or other materials provided with the
37 * distribution.
38 * * Neither the name of Intel Corporation nor the names of its
39 * contributors may be used to endorse or promote products derived
40 * from this software without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 */
54
55#include <linux/linkage.h>
56#include "sha1_mb_mgr_datastruct.S"
57
58
/* Hash routine called by the submit path. */
.extern sha1_x8_avx2

/*
 * Register aliases for the submit path (GAS, AT&T syntax).
 * Several aliases deliberately share one register; they name the different
 * roles that register plays at different points of the routine.
 */

# LINUX register definitions
arg1		= %rdi
arg2		= %rsi
size_offset	= %rcx
tmp2		= %rcx
extra_blocks	= %rdx

# Common definitions
#define state	arg1
#define job	%rsi
#define len2	arg2
#define p2	arg2

# idx must be a register not clobbered by sha1_x8_avx2
idx		= %r8
DWORD_idx	= %r8d
last_len	= %r8

p		= %r11
start_offset	= %r11

unused_lanes	= %rbx
BYTE_unused_lanes = %bl

job_rax		= %rax
len		= %rax
DWORD_len	= %eax

lane		= %rbp
tmp3		= %rbp

tmp		= %r9
DWORD_tmp	= %r9d

lane_data	= %r10

# STACK_SPACE needs to be an odd multiple of 8
STACK_SPACE	= 8*8 + 16*10 + 8
99
/*
 * JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
 * arg 1 : rdi : state
 * arg 2 : rsi : job
 *
 * Places the job in a free lane taken from the low nibble of unused_lanes
 * and copies its length, digest and buffer pointer into the lane arrays.
 * If free lanes remain afterwards, returns NULL in rax.  Once the last
 * free lane has been taken, the minimum of the eight length words is
 * found, sha1_x8_avx2 is called for that length (unless it is zero), and
 * the job of the selected lane is returned in rax with its digest updated.
 */
ENTRY(sha1_mb_mgr_submit_avx2)

	/* Build a 32-byte aligned frame; r10 carries the incoming rsp. */
	mov	%rsp, %r10
	sub	$STACK_SPACE, %rsp
	and	$~31, %rsp

	mov	%rbx, (%rsp)
	mov	%r10, 8*2(%rsp)	#save old rsp
	mov	%rbp, 8*3(%rsp)
	mov	%r12, 8*4(%rsp)
	mov	%r13, 8*5(%rsp)
	mov	%r14, 8*6(%rsp)
	mov	%r15, 8*7(%rsp)

	/* Pop a free lane number off the unused_lanes nibble list. */
	mov	_unused_lanes(state), unused_lanes
	mov	unused_lanes, lane
	and	$0xF, lane
	shr	$4, unused_lanes
	imul	$_LANE_DATA_size, lane, lane_data
	movl	$STS_BEING_PROCESSED, _status(job)
	lea	_ldata(state, lane_data), lane_data
	mov	unused_lanes, _unused_lanes(state)
	movl	_len(job), DWORD_len

	/* Length word for the lane: (len << 4) | lane number. */
	mov	job, _job_in_lane(lane_data)
	shl	$4, len
	or	lane, len

	movl	DWORD_len, _lens(state , lane, 4)

	# Load digest words from result_digest
	vmovdqu	_result_digest(job), %xmm0
	mov	_result_digest+1*16(job), DWORD_tmp
	vmovd	%xmm0, _args_digest(state, lane, 4)
	vpextrd	$1, %xmm0, _args_digest+1*32(state , lane, 4)
	vpextrd	$2, %xmm0, _args_digest+2*32(state , lane, 4)
	vpextrd	$3, %xmm0, _args_digest+3*32(state , lane, 4)
	movl	DWORD_tmp, _args_digest+4*32(state , lane, 4)

	mov	_buffer(job), p
	mov	p, _args_data_ptr(state, lane, 8)

	/* Only the 0xF end marker left means every lane is now in use. */
	cmp	$0xF, unused_lanes
	jne	.Lsubmit_return_null

.Lsubmit_start_loop:
	# Find min length
	vmovdqa	_lens(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min value in low dword

	/* Split the minimum into lane number (low nibble) and length. */
	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx
	shr	$4, len2
	jz	.Lsubmit_len_is_0

	/* Subtract the minimum length (lane nibble cleared) from all lanes. */
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2

	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqa	%xmm0, _lens + 0*16(state)
	vmovdqa	%xmm1, _lens + 1*16(state)


	# "state" and "args" are the same address, arg1
	# len is arg2
	call	sha1_x8_avx2

	# state and idx are intact

.Lsubmit_len_is_0:
	# process completed job "idx"
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	mov	_unused_lanes(state), unused_lanes
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	/* Push the freed lane number onto the unused_lanes nibble list. */
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/*
	 * Gather the five digest words of lane idx (32 bytes apart).  All
	 * five are addressed relative to _args_digest, matching the stores
	 * made when the job was submitted.
	 */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
	movl	_args_digest+4*32(state, idx, 4), DWORD_tmp

	vmovdqu	%xmm0, _result_digest(job_rax)
	movl	DWORD_tmp, _result_digest+1*16(job_rax)

.Lsubmit_return:

	mov	(%rsp), %rbx
	mov	8*2(%rsp), %r10	#restore old rsp
	mov	8*3(%rsp), %rbp
	mov	8*4(%rsp), %r12
	mov	8*5(%rsp), %r13
	mov	8*6(%rsp), %r14
	mov	8*7(%rsp), %r15
	mov	%r10, %rsp

	ret

.Lsubmit_return_null:
	xor	job_rax, job_rax
	jmp	.Lsubmit_return

ENDPROC(sha1_mb_mgr_submit_avx2)
223
/*
 * Mask that clears the lane nibble of the low dword (upper dwords zero).
 * It is only ever read, so it is placed in the read-only data section.
 */
.section .rodata

.align 16
clear_low_nibble:
	.octa	0x000000000000000000000000FFFFFFF0