diff options
-rw-r--r-- | arch/x86/crypto/camellia-aesni-avx-asm_64.S | 180 | ||||
-rw-r--r-- | arch/x86/crypto/camellia_aesni_avx_glue.c | 91 |
2 files changed, 229 insertions, 42 deletions
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index cfc163469c71..ce71f9212409 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * x86_64/AVX/AES-NI assembler implementation of Camellia | 2 | * x86_64/AVX/AES-NI assembler implementation of Camellia |
3 | * | 3 | * |
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
589 | .Lbswap128_mask: | 589 | .Lbswap128_mask: |
590 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 590 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
591 | 591 | ||
592 | /* For XTS mode IV generation */ | ||
593 | .Lxts_gf128mul_and_shl1_mask: | ||
594 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
595 | |||
592 | /* | 596 | /* |
593 | * pre-SubByte transform | 597 | * pre-SubByte transform |
594 | * | 598 | * |
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way) | |||
1090 | 1094 | ||
1091 | ret; | 1095 | ret; |
1092 | ENDPROC(camellia_ctr_16way) | 1096 | ENDPROC(camellia_ctr_16way) |
1097 | |||
1098 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
1099 | vpsrad $31, iv, tmp; \ | ||
1100 | vpaddq iv, iv, iv; \ | ||
1101 | vpshufd $0x13, tmp, tmp; \ | ||
1102 | vpand mask, tmp, tmp; \ | ||
1103 | vpxor tmp, iv, iv; | ||
1104 | |||
1105 | .align 8 | ||
1106 | camellia_xts_crypt_16way: | ||
1107 | /* input: | ||
1108 | * %rdi: ctx, CTX | ||
1109 | * %rsi: dst (16 blocks) | ||
1110 | * %rdx: src (16 blocks) | ||
1111 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1112 | * %r8: index for input whitening key | ||
1113 | * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 | ||
1114 | */ | ||
1115 | |||
1116 | subq $(16 * 16), %rsp; | ||
1117 | movq %rsp, %rax; | ||
1118 | |||
1119 | vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; | ||
1120 | |||
1121 | /* load IV */ | ||
1122 | vmovdqu (%rcx), %xmm0; | ||
1123 | vpxor 0 * 16(%rdx), %xmm0, %xmm15; | ||
1124 | vmovdqu %xmm15, 15 * 16(%rax); | ||
1125 | vmovdqu %xmm0, 0 * 16(%rsi); | ||
1126 | |||
1127 | /* construct IVs */ | ||
1128 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1129 | vpxor 1 * 16(%rdx), %xmm0, %xmm15; | ||
1130 | vmovdqu %xmm15, 14 * 16(%rax); | ||
1131 | vmovdqu %xmm0, 1 * 16(%rsi); | ||
1132 | |||
1133 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1134 | vpxor 2 * 16(%rdx), %xmm0, %xmm13; | ||
1135 | vmovdqu %xmm0, 2 * 16(%rsi); | ||
1136 | |||
1137 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1138 | vpxor 3 * 16(%rdx), %xmm0, %xmm12; | ||
1139 | vmovdqu %xmm0, 3 * 16(%rsi); | ||
1140 | |||
1141 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1142 | vpxor 4 * 16(%rdx), %xmm0, %xmm11; | ||
1143 | vmovdqu %xmm0, 4 * 16(%rsi); | ||
1144 | |||
1145 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1146 | vpxor 5 * 16(%rdx), %xmm0, %xmm10; | ||
1147 | vmovdqu %xmm0, 5 * 16(%rsi); | ||
1148 | |||
1149 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1150 | vpxor 6 * 16(%rdx), %xmm0, %xmm9; | ||
1151 | vmovdqu %xmm0, 6 * 16(%rsi); | ||
1152 | |||
1153 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1154 | vpxor 7 * 16(%rdx), %xmm0, %xmm8; | ||
1155 | vmovdqu %xmm0, 7 * 16(%rsi); | ||
1156 | |||
1157 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1158 | vpxor 8 * 16(%rdx), %xmm0, %xmm7; | ||
1159 | vmovdqu %xmm0, 8 * 16(%rsi); | ||
1160 | |||
1161 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1162 | vpxor 9 * 16(%rdx), %xmm0, %xmm6; | ||
1163 | vmovdqu %xmm0, 9 * 16(%rsi); | ||
1164 | |||
1165 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1166 | vpxor 10 * 16(%rdx), %xmm0, %xmm5; | ||
1167 | vmovdqu %xmm0, 10 * 16(%rsi); | ||
1168 | |||
1169 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1170 | vpxor 11 * 16(%rdx), %xmm0, %xmm4; | ||
1171 | vmovdqu %xmm0, 11 * 16(%rsi); | ||
1172 | |||
1173 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1174 | vpxor 12 * 16(%rdx), %xmm0, %xmm3; | ||
1175 | vmovdqu %xmm0, 12 * 16(%rsi); | ||
1176 | |||
1177 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1178 | vpxor 13 * 16(%rdx), %xmm0, %xmm2; | ||
1179 | vmovdqu %xmm0, 13 * 16(%rsi); | ||
1180 | |||
1181 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1182 | vpxor 14 * 16(%rdx), %xmm0, %xmm1; | ||
1183 | vmovdqu %xmm0, 14 * 16(%rsi); | ||
1184 | |||
1185 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1186 | vpxor 15 * 16(%rdx), %xmm0, %xmm15; | ||
1187 | vmovdqu %xmm15, 0 * 16(%rax); | ||
1188 | vmovdqu %xmm0, 15 * 16(%rsi); | ||
1189 | |||
1190 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1191 | vmovdqu %xmm0, (%rcx); | ||
1192 | |||
1193 | /* inpack16_pre: */ | ||
1194 | vmovq (key_table)(CTX, %r8, 8), %xmm15; | ||
1195 | vpshufb .Lpack_bswap, %xmm15, %xmm15; | ||
1196 | vpxor 0 * 16(%rax), %xmm15, %xmm0; | ||
1197 | vpxor %xmm1, %xmm15, %xmm1; | ||
1198 | vpxor %xmm2, %xmm15, %xmm2; | ||
1199 | vpxor %xmm3, %xmm15, %xmm3; | ||
1200 | vpxor %xmm4, %xmm15, %xmm4; | ||
1201 | vpxor %xmm5, %xmm15, %xmm5; | ||
1202 | vpxor %xmm6, %xmm15, %xmm6; | ||
1203 | vpxor %xmm7, %xmm15, %xmm7; | ||
1204 | vpxor %xmm8, %xmm15, %xmm8; | ||
1205 | vpxor %xmm9, %xmm15, %xmm9; | ||
1206 | vpxor %xmm10, %xmm15, %xmm10; | ||
1207 | vpxor %xmm11, %xmm15, %xmm11; | ||
1208 | vpxor %xmm12, %xmm15, %xmm12; | ||
1209 | vpxor %xmm13, %xmm15, %xmm13; | ||
1210 | vpxor 14 * 16(%rax), %xmm15, %xmm14; | ||
1211 | vpxor 15 * 16(%rax), %xmm15, %xmm15; | ||
1212 | |||
1213 | call *%r9; | ||
1214 | |||
1215 | addq $(16 * 16), %rsp; | ||
1216 | |||
1217 | vpxor 0 * 16(%rsi), %xmm7, %xmm7; | ||
1218 | vpxor 1 * 16(%rsi), %xmm6, %xmm6; | ||
1219 | vpxor 2 * 16(%rsi), %xmm5, %xmm5; | ||
1220 | vpxor 3 * 16(%rsi), %xmm4, %xmm4; | ||
1221 | vpxor 4 * 16(%rsi), %xmm3, %xmm3; | ||
1222 | vpxor 5 * 16(%rsi), %xmm2, %xmm2; | ||
1223 | vpxor 6 * 16(%rsi), %xmm1, %xmm1; | ||
1224 | vpxor 7 * 16(%rsi), %xmm0, %xmm0; | ||
1225 | vpxor 8 * 16(%rsi), %xmm15, %xmm15; | ||
1226 | vpxor 9 * 16(%rsi), %xmm14, %xmm14; | ||
1227 | vpxor 10 * 16(%rsi), %xmm13, %xmm13; | ||
1228 | vpxor 11 * 16(%rsi), %xmm12, %xmm12; | ||
1229 | vpxor 12 * 16(%rsi), %xmm11, %xmm11; | ||
1230 | vpxor 13 * 16(%rsi), %xmm10, %xmm10; | ||
1231 | vpxor 14 * 16(%rsi), %xmm9, %xmm9; | ||
1232 | vpxor 15 * 16(%rsi), %xmm8, %xmm8; | ||
1233 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | ||
1234 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | ||
1235 | %xmm8, %rsi); | ||
1236 | |||
1237 | ret; | ||
1238 | ENDPROC(camellia_xts_crypt_16way) | ||
1239 | |||
1240 | ENTRY(camellia_xts_enc_16way) | ||
1241 | /* input: | ||
1242 | * %rdi: ctx, CTX | ||
1243 | * %rsi: dst (16 blocks) | ||
1244 | * %rdx: src (16 blocks) | ||
1245 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1246 | */ | ||
1247 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | ||
1248 | |||
1249 | leaq __camellia_enc_blk16, %r9; | ||
1250 | |||
1251 | jmp camellia_xts_crypt_16way; | ||
1252 | ENDPROC(camellia_xts_enc_16way) | ||
1253 | |||
1254 | ENTRY(camellia_xts_dec_16way) | ||
1255 | /* input: | ||
1256 | * %rdi: ctx, CTX | ||
1257 | * %rsi: dst (16 blocks) | ||
1258 | * %rdx: src (16 blocks) | ||
1259 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1260 | */ | ||
1261 | |||
1262 | cmpl $16, key_length(CTX); | ||
1263 | movl $32, %r8d; | ||
1264 | movl $24, %eax; | ||
1265 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | ||
1266 | |||
1267 | leaq __camellia_dec_blk16, %r9; | ||
1268 | |||
1269 | jmp camellia_xts_crypt_16way; | ||
1270 | ENDPROC(camellia_xts_dec_16way) | ||
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 96cbb6068fce..4ff7ed47b3db 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia | 2 | * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia |
3 | * | 3 | * |
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -37,6 +37,23 @@ asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | |||
37 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | 37 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, |
38 | const u8 *src, le128 *iv); | 38 | const u8 *src, le128 *iv); |
39 | 39 | ||
40 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
41 | const u8 *src, le128 *iv); | ||
42 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
43 | const u8 *src, le128 *iv); | ||
44 | |||
45 | static void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
46 | { | ||
47 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
48 | GLUE_FUNC_CAST(camellia_enc_blk)); | ||
49 | } | ||
50 | |||
51 | static void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
52 | { | ||
53 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
54 | GLUE_FUNC_CAST(camellia_dec_blk)); | ||
55 | } | ||
56 | |||
40 | static const struct common_glue_ctx camellia_enc = { | 57 | static const struct common_glue_ctx camellia_enc = { |
41 | .num_funcs = 3, | 58 | .num_funcs = 3, |
42 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | 59 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, |
@@ -69,6 +86,19 @@ static const struct common_glue_ctx camellia_ctr = { | |||
69 | } } | 86 | } } |
70 | }; | 87 | }; |
71 | 88 | ||
89 | static const struct common_glue_ctx camellia_enc_xts = { | ||
90 | .num_funcs = 2, | ||
91 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
92 | |||
93 | .funcs = { { | ||
94 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
95 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } | ||
96 | }, { | ||
97 | .num_blocks = 1, | ||
98 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } | ||
99 | } } | ||
100 | }; | ||
101 | |||
72 | static const struct common_glue_ctx camellia_dec = { | 102 | static const struct common_glue_ctx camellia_dec = { |
73 | .num_funcs = 3, | 103 | .num_funcs = 3, |
74 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | 104 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, |
@@ -101,6 +131,19 @@ static const struct common_glue_ctx camellia_dec_cbc = { | |||
101 | } } | 131 | } } |
102 | }; | 132 | }; |
103 | 133 | ||
134 | static const struct common_glue_ctx camellia_dec_xts = { | ||
135 | .num_funcs = 2, | ||
136 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
137 | |||
138 | .funcs = { { | ||
139 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
140 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } | ||
141 | }, { | ||
142 | .num_blocks = 1, | ||
143 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } | ||
144 | } } | ||
145 | }; | ||
146 | |||
104 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 147 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
105 | struct scatterlist *src, unsigned int nbytes) | 148 | struct scatterlist *src, unsigned int nbytes) |
106 | { | 149 | { |
@@ -261,54 +304,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
261 | struct scatterlist *src, unsigned int nbytes) | 304 | struct scatterlist *src, unsigned int nbytes) |
262 | { | 305 | { |
263 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 306 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
264 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
265 | struct crypt_priv crypt_ctx = { | ||
266 | .ctx = &ctx->crypt_ctx, | ||
267 | .fpu_enabled = false, | ||
268 | }; | ||
269 | struct xts_crypt_req req = { | ||
270 | .tbuf = buf, | ||
271 | .tbuflen = sizeof(buf), | ||
272 | |||
273 | .tweak_ctx = &ctx->tweak_ctx, | ||
274 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | ||
275 | .crypt_ctx = &crypt_ctx, | ||
276 | .crypt_fn = encrypt_callback, | ||
277 | }; | ||
278 | int ret; | ||
279 | |||
280 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
281 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
282 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
283 | 307 | ||
284 | return ret; | 308 | return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, |
309 | XTS_TWEAK_CAST(camellia_enc_blk), | ||
310 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
285 | } | 311 | } |
286 | 312 | ||
287 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 313 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
288 | struct scatterlist *src, unsigned int nbytes) | 314 | struct scatterlist *src, unsigned int nbytes) |
289 | { | 315 | { |
290 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 316 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
291 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
292 | struct crypt_priv crypt_ctx = { | ||
293 | .ctx = &ctx->crypt_ctx, | ||
294 | .fpu_enabled = false, | ||
295 | }; | ||
296 | struct xts_crypt_req req = { | ||
297 | .tbuf = buf, | ||
298 | .tbuflen = sizeof(buf), | ||
299 | 317 | ||
300 | .tweak_ctx = &ctx->tweak_ctx, | 318 | return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, |
301 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | 319 | XTS_TWEAK_CAST(camellia_enc_blk), |
302 | .crypt_ctx = &crypt_ctx, | 320 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
303 | .crypt_fn = decrypt_callback, | ||
304 | }; | ||
305 | int ret; | ||
306 | |||
307 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
308 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
309 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
310 | |||
311 | return ret; | ||
312 | } | 321 | } |
313 | 322 | ||
314 | static struct crypto_alg cmll_algs[10] = { { | 323 | static struct crypto_alg cmll_algs[10] = { { |