aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
author: Megha Dey <megha.dey@intel.com> 2016-06-23 21:40:44 -0400
committer: Herbert Xu <herbert@gondor.apana.org.au> 2016-06-27 04:57:44 -0400
commit: a377c6b1876e7ac847a124998e828baf9d8c89c2 (patch)
tree: 7a096cccdafe7c5aa6b743d7556814e1a644321e /arch/x86
parent: 9be7e24483998fa6a34c2b191c4798b8189f8f9e (diff)
crypto: sha256-mb - submit/flush routines for AVX2
This patch introduces the routines used to submit and flush buffers belonging to SHA256 crypto jobs to the SHA256 multibuffer algorithm. It is implemented mostly in assembly optimized with AVX2 instructions.

Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S 304
-rw-r--r-- arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c 65
-rw-r--r-- arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S 215
3 files changed, 584 insertions, 0 deletions
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..b691da981cd9
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -0,0 +1,304 @@
1/*
2 * Flush routine for SHA256 multibuffer
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2016 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Megha Dey <megha.dey@linux.intel.com>
22 *
23 * BSD LICENSE
24 *
25 * Copyright(c) 2016 Intel Corporation.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 *
31 * * Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * * Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in
35 * the documentation and/or other materials provided with the
36 * distribution.
37 * * Neither the name of Intel Corporation nor the names of its
38 * contributors may be used to endorse or promote products derived
39 * from this software without specific prior written permission.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
44 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
45 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
46 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
47 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
52 */
53#include <linux/linkage.h>
54#include <asm/frame.h>
55#include "sha256_mb_mgr_datastruct.S"
56
57.extern sha256_x8_avx2
58
/*
 * Register roles for the routines in this file (System V AMD64 ABI: first
 * integer argument in %rdi, second in %rsi).
 *
 * Several names alias the same physical register; an alias is only
 * meaningful while the value it names is live.
 */

/* Linux (System V) argument registers */
#define arg1		%rdi
#define arg2		%rsi

/* Common register definitions */
#define state		arg1	/* manager state pointer (first argument) */
#define job		arg2
#define len2		arg2	/* length handed to sha256_x8_avx2 */

/* idx must be a register not clobbered by sha256_x8_avx2 */
#define idx		%r8
#define DWORD_idx	%r8d

#define unused_lanes	%rbx
#define lane_data	%rbx
#define tmp2		%rbx
#define tmp2_w		%ebx

#define job_rax		%rax
#define tmp1		%rax
#define size_offset	%rax
#define tmp		%rax
#define start_offset	%rax

/*
 * arg1/arg2 already expand to a "%reg" operand, so they are used bare here.
 * The earlier "%arg1"/"%arg2" spellings expanded to a doubled register
 * prefix, which does not assemble.
 */
#define tmp3		arg1

#define extra_blocks	arg2
#define p		arg2

/* Emit the label "<prefix><n>:". */
.macro LABEL prefix n
\prefix\n\():
.endm

/* Branch to skip_<i> when ZF is clear. */
.macro JNE_SKIP i
jne	skip_\i
.endm

/* Assign the assembler symbol "offset"; defined while altmacro mode is on. */
.altmacro
.macro SET_OFFSET _offset
offset = \_offset
.endm
.noaltmacro
101
/*
 * JOB_SHA256 *sha256_mb_mgr_flush_avx2(MB_MGR *state)
 *
 * In:   arg1 (%rdi) = state
 * Out:  %rax = job removed from its lane and marked STS_COMPLETED,
 *       or NULL when all lanes were already empty
 *
 * %rbx is saved and restored.  sha256_x8_avx2 is expected to leave
 * state and idx intact.
 */
ENTRY(sha256_mb_mgr_flush_avx2)
	FRAME_BEGIN
	push	%rbx

	/* Bit 32+3 of unused_lanes set means all lanes are empty. */
	mov	_unused_lanes(state), unused_lanes
	bt	$32+3, unused_lanes
	jc	.Lflush_no_job

	/*
	 * Pick a lane with a non-NULL job.  idx starts at 0 and is replaced
	 * by each higher lane number whose job_in_lane is set, so the highest
	 * occupied lane is the one kept.
	 */
	xor	DWORD_idx, DWORD_idx
	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	one(%rip), idx
	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	two(%rip), idx
	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	three(%rip), idx
	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	four(%rip), idx
	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	five(%rip), idx
	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	six(%rip), idx
	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	seven(%rip), idx

	/*
	 * Give every empty lane the data pointer of lane idx and the largest
	 * unsigned length, so the minimum search below favours real jobs.
	 */
	offset = (_args + _data_ptr)
	mov	offset(state,idx,8), tmp

	I = 0
.rep 8
	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	jne	1f				/* lane I is occupied: leave it alone */
	offset = (_args + _data_ptr + 8*I)
	mov	tmp, offset(state)
	offset = (_lens + 4*I)
	movl	$0xFFFFFFFF, offset(state)
1:
	I = (I+1)
.endr

	/* Find min length: unsigned minimum over the eight lens dwords. */
	vmovdqa	_lens+0*16(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# four pairwise minima
	vpalignr $8, %xmm2, %xmm3, %xmm3	# low qword = upper two dwords of xmm2
	vpminud	%xmm3, %xmm2, %xmm2		# two candidates left in low qword
	vpalignr $4, %xmm2, %xmm3, %xmm3	# low dword = dword 1 of xmm2
	vpminud	%xmm3, %xmm2, %xmm2		# min val in low dword

	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx			# low nibble selects the lane
	shr	$4, len2			# remaining bits are the length
	jz	.Lflush_len_is_0

	/* Subtract the minimum (lane nibble masked off) from all eight lens. */
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2

	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqa	%xmm0, _lens+0*16(state)
	vmovdqa	%xmm1, _lens+1*16(state)

	/* "state" and "args" are the same address, arg1; len is arg2 */
	call	sha256_x8_avx2
	/* state and idx are intact */

.Lflush_len_is_0:
	/* Lane idx has completed: locate its lane data entry. */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	/* Detach the job from the lane and mark it completed. */
	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)

	/* Put idx back into the low nibble of the unused-lane list. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes

	mov	unused_lanes, _unused_lanes(state)
	movl	$0xFFFFFFFF, _lens(state,idx,4)

	/*
	 * Gather the eight digest words of lane idx (32 bytes apart in
	 * args_digest) and store them to the job result_digest.
	 */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)

.Lflush_return:
	pop	%rbx
	FRAME_END
	ret

.Lflush_no_job:
	xor	%eax, %eax			/* job_rax = NULL */
	jmp	.Lflush_return
ENDPROC(sha256_mb_mgr_flush_avx2)
224
225##############################################################################
226
/*
 * sha256_mb_mgr_get_comp_job_avx2: hand back a job that is already finished.
 *
 * In:   arg1 (%rdi) = state
 * Out:  %rax = job of the lane holding the smallest lens entry, provided
 *       that entry has no bits set above its low (lane index) nibble; the
 *       job is detached from its lane, marked STS_COMPLETED and receives
 *       all eight digest words of that lane.
 *       %rax = NULL when all lanes are empty or no such lane exists.
 *
 * Leaf routine: sha256_x8_avx2 is not called.  %rbx is saved and restored.
 */
.align 16
ENTRY(sha256_mb_mgr_get_comp_job_avx2)
	push	%rbx

	/* Bit 32+3 of unused_lanes set means all lanes are empty. */
	mov	_unused_lanes(state), unused_lanes
	bt	$(32+3), unused_lanes
	jc	.Lcomp_no_job

	/* Find min length: unsigned minimum over the eight lens dwords. */
	vmovdqa	_lens(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# four pairwise minima
	vpalignr $8, %xmm2, %xmm3, %xmm3	# low qword = upper two dwords of xmm2
	vpminud	%xmm3, %xmm2, %xmm2		# two candidates left in low qword
	vpalignr $4, %xmm2, %xmm3, %xmm3	# low dword = dword 1 of xmm2
	vpminud	%xmm3, %xmm2, %xmm2		# min val in low dword

	/* Only a minimum with nothing above the lane nibble is returned. */
	vmovd	%xmm2, DWORD_idx
	test	$~0xF, idx
	jnz	.Lcomp_no_job

	/* process completed job "idx" */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)

	/* Put idx back into the low nibble of the unused-lane list. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/*
	 * Gather all eight digest words of lane idx (32 bytes apart in
	 * args_digest).  Word 4 has to start %xmm1 so that words 4-7 can be
	 * written back with one 16-byte store, as the flush path does;
	 * routing word 4 through a general register and storing a single
	 * dword left words 5-7 of result_digest unwritten.
	 */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)

	pop	%rbx

	ret

.Lcomp_no_job:
	xor	%eax, %eax			/* job_rax = NULL */
	pop	%rbx
	ret
ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
285
/*
 * Constants used by the routines above.  They are only ever read (as vpand
 * and cmovne memory operands), so they are placed in .rodata instead of the
 * writable .data section.
 */
.section	.rodata

.align 16
/* Clears the low nibble of dword 0 and zeroes dwords 1-3. */
clear_low_nibble:
.octa	0x000000000000000000000000FFFFFFF0

/* Lane numbers 1-7 as 64-bit values; cmovne has no immediate form. */
one:
.quad	1
two:
.quad	2
three:
.quad	3
four:
.quad	4
five:
.quad	5
six:
.quad	6
seven:
.quad	7
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..b0c498371e67
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,65 @@
1/*
2 * Initialization code for multi buffer SHA256 algorithm for AVX2
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2016 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Megha Dey <megha.dey@linux.intel.com>
22 *
23 * BSD LICENSE
24 *
25 * Copyright(c) 2016 Intel Corporation.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 *
31 * * Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * * Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in
35 * the documentation and/or other materials provided with the
36 * distribution.
37 * * Neither the name of Intel Corporation nor the names of its
38 * contributors may be used to endorse or promote products derived
39 * from this software without specific prior written permission.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
44 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
45 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
46 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
47 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
52 */
53
54#include "sha256_mb_mgr.h"
55
56void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state)
57{
58 unsigned int j;
59
60 state->unused_lanes = 0xF76543210ULL;
61 for (j = 0; j < 8; j++) {
62 state->lens[j] = 0xFFFFFFFF;
63 state->ldata[j].job_in_lane = NULL;
64 }
65}
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..7ea670e25acc
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
@@ -0,0 +1,215 @@
1/*
2 * Buffer submit code for multi buffer SHA256 algorithm
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2016 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Megha Dey <megha.dey@linux.intel.com>
22 *
23 * BSD LICENSE
24 *
25 * Copyright(c) 2016 Intel Corporation.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 *
31 * * Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * * Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in
35 * the documentation and/or other materials provided with the
36 * distribution.
37 * * Neither the name of Intel Corporation nor the names of its
38 * contributors may be used to endorse or promote products derived
39 * from this software without specific prior written permission.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
44 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
45 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
46 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
47 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
48 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
49 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
50 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
51 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
52 */
53
54#include <linux/linkage.h>
55#include <asm/frame.h>
56#include "sha256_mb_mgr_datastruct.S"
57
58.extern sha256_x8_avx2
59
/*
 * Register aliases for sha256_mb_mgr_submit_avx2, grouped by physical
 * register.  System V AMD64: first argument in %rdi, second in %rsi.
 * Aliases that share a register cannot be live at the same time.
 */

/* Linux argument registers */
arg1 = %rdi
arg2 = %rsi

/* The two arguments by role */
#define state arg1
#define job %rsi
#define len2 arg2
#define p2 arg2

/* %rax: returned job; also the job length while it is being packed */
job_rax = %rax
len = %rax
DWORD_len = %eax

/* %rbx (callee-saved): unused-lane list */
unused_lanes = %rbx
BYTE_unused_lanes = %bl

/* %rcx, %rdx */
size_offset = %rcx
tmp2 = %rcx
extra_blocks = %rdx

/* %r8: idx must be a register not clobbered by sha256_x8_avx2 */
idx = %r8
DWORD_idx = %r8d
last_len = %r8

/* %r9 */
tmp = %r9
DWORD_tmp = %r9d

/* %r10 */
lane_data = %r10

/* %r11 */
p = %r11
start_offset = %r11

/* %r12 (callee-saved) */
lane = %r12
tmp3 = %r12
95
/*
 * JOB *sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
 *
 * In:   arg1 (%rdi) = state
 *       arg2 (%rsi) = job
 * Out:  %rax = NULL when unused_lanes still differs from 0xF after this
 *       job has taken a lane; otherwise the job of the lane with the
 *       smallest length, detached from its lane and marked STS_COMPLETED
 *
 * %rbx and %r12 are saved and restored.  sha256_x8_avx2 is expected to
 * leave state and idx intact.
 */
ENTRY(sha256_mb_mgr_submit_avx2)
	FRAME_BEGIN
	push	%rbx
	push	%r12

	/*
	 * Take the lane number from the low nibble of unused_lanes, mark the
	 * job as being processed and find the lane data entry for that lane.
	 */
	mov	_unused_lanes(state), unused_lanes
	mov	unused_lanes, lane
	and	$0xF, lane
	shr	$4, unused_lanes
	imul	$_LANE_DATA_size, lane, lane_data
	movl	$STS_BEING_PROCESSED, _status(job)
	lea	_ldata(state, lane_data), lane_data
	mov	unused_lanes, _unused_lanes(state)
	movl	_len(job), DWORD_len

	/* Record the job in its lane; lens[lane] = (len << 4) | lane. */
	mov	job, _job_in_lane(lane_data)
	shl	$4, len
	or	lane, len

	movl	DWORD_len, _lens(state, lane, 4)

	/*
	 * Scatter the eight words of the job result_digest into column
	 * "lane" of args_digest (rows are 32 bytes apart).
	 */
	vmovdqu	_result_digest(job), %xmm0
	vmovdqu	_result_digest+1*16(job), %xmm1
	vmovd	%xmm0, _args_digest(state, lane, 4)
	vpextrd	$1, %xmm0, _args_digest+1*32(state, lane, 4)
	vpextrd	$2, %xmm0, _args_digest+2*32(state, lane, 4)
	vpextrd	$3, %xmm0, _args_digest+3*32(state, lane, 4)
	vmovd	%xmm1, _args_digest+4*32(state, lane, 4)

	vpextrd	$1, %xmm1, _args_digest+5*32(state, lane, 4)
	vpextrd	$2, %xmm1, _args_digest+6*32(state, lane, 4)
	vpextrd	$3, %xmm1, _args_digest+7*32(state, lane, 4)

	/* Record the job buffer as the data pointer of this lane. */
	mov	_buffer(job), p
	mov	p, _args_data_ptr(state, lane, 8)

	/* Hash only once unused_lanes is down to the single 0xF nibble. */
	cmp	$0xF, unused_lanes
	jne	.Lsubmit_no_job

	/* Find min length: unsigned minimum over the eight lens dwords. */
	vmovdqa	_lens(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# four pairwise minima
	vpalignr $8, %xmm2, %xmm3, %xmm3	# low qword = upper two dwords of xmm2
	vpminud	%xmm3, %xmm2, %xmm2		# two candidates left in low qword
	vpalignr $4, %xmm2, %xmm3, %xmm3	# low dword = dword 1 of xmm2
	vpminud	%xmm3, %xmm2, %xmm2		# min val in low dword

	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx			# low nibble selects the lane
	shr	$4, len2			# remaining bits are the length
	jz	.Lsubmit_len_is_0

	/* Subtract the minimum (lane nibble masked off) from all eight lens. */
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2

	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqa	%xmm0, _lens + 0*16(state)
	vmovdqa	%xmm1, _lens + 1*16(state)

	/* "state" and "args" are the same address, arg1; len is arg2 */
	call	sha256_x8_avx2

	/* state and idx are intact */

.Lsubmit_len_is_0:
	/* Lane idx has completed: locate its lane data entry. */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	/* Detach the job, mark it completed and free the lane. */
	mov	_job_in_lane(lane_data), job_rax
	mov	_unused_lanes(state), unused_lanes
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state,idx,4)

	/* Gather the eight digest words of lane idx into the job. */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1

	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)

.Lsubmit_return:
	pop	%r12
	pop	%rbx
	FRAME_END
	ret

.Lsubmit_no_job:
	xor	%eax, %eax			/* job_rax = NULL */
	jmp	.Lsubmit_return

ENDPROC(sha256_mb_mgr_submit_avx2)
210
/*
 * clear_low_nibble is only ever read (as a vpand memory operand), so it is
 * placed in .rodata instead of the writable .data section.
 */
.section	.rodata

.align 16
/* Clears the low nibble of dword 0 and zeroes dwords 1-3. */
clear_low_nibble:
	.octa	0x000000000000000000000000FFFFFFF0