author		Jussi Kivilinna <jussi.kivilinna@iki.fi>	2013-04-08 14:51:11 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2013-04-25 09:01:52 -0400
commit		b5c5b072dc2f35d45d3404b957e264a3e8e71069 (patch)
tree		55c726bf708c54cb50d3f564b39f6e5bac0b96bf
parent		70177286e1d49dfa2ce565af10d1f63d9b769d77 (diff)
crypto: x86/camellia-aesni-avx - add more optimized XTS code
Add more optimized XTS code for camellia-aesni-avx, for smaller stack
usage and a small boost in speed.

tcrypt results, with Intel i5-2450M:

        enc     dec
16B     1.10x   1.01x
64B     0.82x   0.77x
256B    1.14x   1.10x
1024B   1.17x   1.16x
8192B   1.10x   1.11x

Since XTS is practically always used with data blocks of 512 bytes or
more, I chose not to use camellia-2way for blocks smaller than 256
bytes. This causes the slower tcrypt result for 64 bytes.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
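The core of the new code is the XTS tweak update: multiplying the 128-bit tweak by α (that is, by x) in GF(2¹²⁸) under the little-endian block convention — shift the tweak left by one bit and, when a bit falls off the top, reduce by XORing 0x87 into the lowest byte. A minimal C sketch of that operation, with illustrative names that are not from the patch:

#include <stdint.h>

/* One 128-bit XTS tweak, split into little-endian 64-bit halves. */
struct xts_tweak { uint64_t lo, hi; };

/* t <- t * alpha in GF(2^128), "ble" (little-endian block) convention. */
void xts_tweak_double(struct xts_tweak *t)
{
	uint64_t carry_lo = t->lo >> 63;	/* bit 63 carries into bit 64 */
	uint64_t carry_hi = t->hi >> 63;	/* bit 127 falls off the top */

	t->hi = (t->hi << 1) | carry_lo;
	t->lo <<= 1;
	if (carry_hi)
		t->lo ^= 0x87;	/* reduce by x^128 + x^7 + x^2 + x + 1 */
}

The gf128mul_x_ble assembler macro added below computes the same thing branchlessly: vpsrad/vpshufd smear the two carry bits across 32-bit lanes, vpand selects 0x87 (the reduction) and 1 (the low-to-high carry) from .Lxts_gf128mul_and_shl1_mask, vpaddq doubles both 64-bit halves, and vpxor folds the carries in.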
-rw-r--r--	arch/x86/crypto/camellia-aesni-avx-asm_64.S	| 180
-rw-r--r--	arch/x86/crypto/camellia_aesni_avx_glue.c	|  91
2 files changed, 229 insertions(+), 42 deletions(-)
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index cfc163469c71..ce71f9212409 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -1,7 +1,7 @@
 /*
  * x86_64/AVX/AES-NI assembler implementation of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
 /*
  * pre-SubByte transform
  *
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)
 
 	ret;
 ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+.align 8
+camellia_xts_crypt_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: index for input whitening key
+	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+	/* load IV */
+	vmovdqu (%rcx), %xmm0;
+	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 15 * 16(%rax);
+	vmovdqu %xmm0, 0 * 16(%rsi);
+
+	/* construct IVs */
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 14 * 16(%rax);
+	vmovdqu %xmm0, 1 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+	vmovdqu %xmm0, 2 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+	vmovdqu %xmm0, 3 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+	vmovdqu %xmm0, 4 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+	vmovdqu %xmm0, 5 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+	vmovdqu %xmm0, 6 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+	vmovdqu %xmm0, 7 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+	vmovdqu %xmm0, 8 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+	vmovdqu %xmm0, 9 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+	vmovdqu %xmm0, 10 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+	vmovdqu %xmm0, 11 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+	vmovdqu %xmm0, 12 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+	vmovdqu %xmm0, 13 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+	vmovdqu %xmm0, 14 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 0 * 16(%rax);
+	vmovdqu %xmm0, 15 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vmovdqu %xmm0, (%rcx);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX, %r8, 8), %xmm15;
+	vpshufb .Lpack_bswap, %xmm15, %xmm15;
+	vpxor 0 * 16(%rax), %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call *%r9;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+ENDPROC(camellia_xts_crypt_16way)
+
+ENTRY(camellia_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)
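For orientation before the glue changes: camellia_xts_crypt_16way above is a two-pass scheme over 16 blocks at a time. Pass one walks the tweak forward with gf128mul_x_ble, parking each tweak in dst and each whitened input (P ⊕ T) in registers plus the 256-byte stack buffer; the shared 16-way Camellia core then runs once; pass two XORs its output against the tweaks waiting in dst. A rough C rendering of that structure, where gf128mul_x_ble and camellia_ecb_blk16 are stand-in declarations rather than symbols from the patch:

#include <stdint.h>

struct blk128 { uint64_t lo, hi; };		/* one 128-bit block */

/* Stand-ins for the patch's primitives: */
void gf128mul_x_ble(struct blk128 *t);		/* tweak doubling, as sketched earlier */
void camellia_ecb_blk16(struct blk128 b[16]);	/* __camellia_{enc,dec}_blk16 core */

void xts_crypt_16way(struct blk128 *dst, const struct blk128 *src,
		     struct blk128 *iv)
{
	struct blk128 buf[16];			/* the asm keeps this scratch on the stack */
	int i;

	/* Pass 1: whiten each input block with its tweak, and stash the
	 * tweak itself in dst so no second temporary buffer is needed. */
	for (i = 0; i < 16; i++) {
		buf[i].lo = src[i].lo ^ iv->lo;
		buf[i].hi = src[i].hi ^ iv->hi;
		dst[i] = *iv;
		gf128mul_x_ble(iv);		/* T <- T*alpha; ends at the next call's first tweak */
	}

	camellia_ecb_blk16(buf);		/* all 16 blocks in parallel */

	/* Pass 2: unwhiten with the tweaks stashed in dst. */
	for (i = 0; i < 16; i++) {
		dst[i].lo ^= buf[i].lo;
		dst[i].hi ^= buf[i].hi;
	}
}

Parking the tweaks in dst is where the stack saving comes from: only the whitened blocks need scratch space, versus the old glue path below, which carried a separate be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS] plus the xts_crypt_req state.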
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 96cbb6068fce..4ff7ed47b3db 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -1,7 +1,7 @@
 /*
  * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -37,6 +37,23 @@ asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, le128 *iv);
 
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+
+static void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(camellia_enc_blk));
+}
+
+static void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(camellia_dec_blk));
+}
+
 static const struct common_glue_ctx camellia_enc = {
 	.num_funcs = 3,
 	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
@@ -69,6 +86,19 @@ static const struct common_glue_ctx camellia_ctr = {
 	} }
 };
 
+static const struct common_glue_ctx camellia_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+	} }
+};
+
 static const struct common_glue_ctx camellia_dec = {
 	.num_funcs = 3,
 	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
@@ -101,6 +131,19 @@ static const struct common_glue_ctx camellia_dec_cbc = {
 	} }
 };
 
+static const struct common_glue_ctx camellia_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+	} }
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -261,54 +304,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	camellia_fpu_end(crypt_ctx.fpu_enabled);
 
-	return ret;
+	return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
 
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	camellia_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
 static struct crypto_alg cmll_algs[10] = { {
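The camellia_enc_xts/camellia_dec_xts tables above let glue_xts_crypt_128bit() choose the widest routine that still fits the remaining data: the 16-way assembler version while at least CAMELLIA_AESNI_PARALLEL_BLOCKS blocks are left, then the one-block fallback. A simplified sketch of that dispatch idea, not the actual glue_helper.c code (which also manages the FPU and the tweak state):

#include <stdint.h>

struct xts_func {
	unsigned int num_blocks;
	void (*fn)(void *ctx, uint8_t *dst, const uint8_t *src, uint8_t *iv);
};

/* Walk the table widest-first; the last entry must have num_blocks == 1
 * so the loop always makes progress. */
void glue_xts_walk(const struct xts_func *funcs, unsigned int num_funcs,
		   void *ctx, uint8_t *dst, const uint8_t *src,
		   unsigned int nblocks, uint8_t *iv)
{
	unsigned int i, n;

	while (nblocks) {
		for (i = 0; i < num_funcs; i++) {
			n = funcs[i].num_blocks;
			if (nblocks < n)
				continue;	/* too little data; try a narrower width */
			funcs[i].fn(ctx, dst, src, iv);
			src += n * 16;
			dst += n * 16;
			nblocks -= n;
			break;
		}
	}
}

This widest-first fallback is also why the commit message reports the 0.82x result at 64 bytes: anything under 256 bytes (16 blocks) skips the parallel path entirely and runs block by block.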