about summary refs log tree commit diff stats
path: root/arch/x86/crypto/camellia-aesni-avx-asm_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/crypto/camellia-aesni-avx-asm_64.S')
-rw-r--r-- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 180
1 file changed, 179 insertions(+), 1 deletion(-)
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index cfc163469c71..ce71f9212409 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -1,7 +1,7 @@
 /*
  * x86_64/AVX/AES-NI assembler implementation of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
 /*
  * pre-SubByte transform
  *
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)
 
 	ret;
 ENDPROC(camellia_ctr_16way)
1097
/*
 * gf128mul_x_ble(iv, mask, tmp): multiply the 128-bit XTS tweak in `iv`
 * by x (alpha) in GF(2^128), in the byte-swapped little-endian ("ble")
 * block representation.  `mask` must hold .Lxts_gf128mul_and_shl1_mask
 * (0x87 in byte 0, 0x01 in byte 8).  Clobbers `tmp`.
 *
 *   vpsrad $31    - replicate each dword's sign bit across the dword
 *                   (captures the bits that will be shifted out)
 *   vpaddq        - shift each 64-bit half left by one (iv += iv; no
 *                   carry crosses the qword boundary)
 *   vpshufd $0x13 - route the pre-shift top bit of the low qword to the
 *                   high qword's low dword, and the top bit of the high
 *                   qword down to dword 0
 *   vpand         - turn those sign masks into the carry-in bit (0x01)
 *                   and the reduction polynomial byte (0x87)
 *   vpxor         - fold carry and reduction into the doubled value
 */
1098#define gf128mul_x_ble(iv, mask, tmp) \
1099 vpsrad $31, iv, tmp; \
1100 vpaddq iv, iv, iv; \
1101 vpshufd $0x13, tmp, tmp; \
1102 vpand mask, tmp, tmp; \
1103 vpxor tmp, iv, iv;
1104
/*
 * Common XTS body shared by camellia_xts_enc_16way/camellia_xts_dec_16way
 * (reached by tail-jump; not an external entry point).  Computes the 16
 * consecutive tweaks t*alpha^i, whitens the source blocks with them, runs
 * the 16-way Camellia core selected by the caller in %r9, then applies the
 * tweaks to the result.  The tweaks themselves are staged in dst (%rsi);
 * the whitened source blocks live partly in registers and partly in a
 * 16*16-byte stack scratch area.  On exit, *%rcx is advanced to the tweak
 * for the next 16-block group.
 */
1105.align 8
1106camellia_xts_crypt_16way:
1107 /* input:
1108 * %rdi: ctx, CTX
1109 * %rsi: dst (16 blocks)
1110 * %rdx: src (16 blocks)
1111 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1112 * %r8: index for input whitening key
1113 * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
1114 */
1115
 /* carve out 16*16 bytes of scratch (%rax) for whitened source blocks */
1116 subq $(16 * 16), %rsp;
1117 movq %rsp, %rax;
1118
1119 vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
1120
1121 /* load IV */
1122 vmovdqu (%rcx), %xmm0;
1123 vpxor 0 * 16(%rdx), %xmm0, %xmm15;
 /* scratch layout is reversed: src block i ^ tweak i goes to (15-i)*16(%rax) */
1124 vmovdqu %xmm15, 15 * 16(%rax);
 /* stash tweak i in dst slot i; it is xored back into the output below */
1125 vmovdqu %xmm0, 0 * 16(%rsi);
1126
1127 /* construct IVs */
1128 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1129 vpxor 1 * 16(%rdx), %xmm0, %xmm15;
1130 vmovdqu %xmm15, 14 * 16(%rax);
1131 vmovdqu %xmm0, 1 * 16(%rsi);
1132
 /* blocks 2..13 keep their whitened value in registers xmm13..xmm2 */
1133 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1134 vpxor 2 * 16(%rdx), %xmm0, %xmm13;
1135 vmovdqu %xmm0, 2 * 16(%rsi);
1136
1137 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1138 vpxor 3 * 16(%rdx), %xmm0, %xmm12;
1139 vmovdqu %xmm0, 3 * 16(%rsi);
1140
1141 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1142 vpxor 4 * 16(%rdx), %xmm0, %xmm11;
1143 vmovdqu %xmm0, 4 * 16(%rsi);
1144
1145 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1146 vpxor 5 * 16(%rdx), %xmm0, %xmm10;
1147 vmovdqu %xmm0, 5 * 16(%rsi);
1148
1149 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1150 vpxor 6 * 16(%rdx), %xmm0, %xmm9;
1151 vmovdqu %xmm0, 6 * 16(%rsi);
1152
1153 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1154 vpxor 7 * 16(%rdx), %xmm0, %xmm8;
1155 vmovdqu %xmm0, 7 * 16(%rsi);
1156
1157 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1158 vpxor 8 * 16(%rdx), %xmm0, %xmm7;
1159 vmovdqu %xmm0, 8 * 16(%rsi);
1160
1161 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1162 vpxor 9 * 16(%rdx), %xmm0, %xmm6;
1163 vmovdqu %xmm0, 9 * 16(%rsi);
1164
1165 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1166 vpxor 10 * 16(%rdx), %xmm0, %xmm5;
1167 vmovdqu %xmm0, 10 * 16(%rsi);
1168
1169 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1170 vpxor 11 * 16(%rdx), %xmm0, %xmm4;
1171 vmovdqu %xmm0, 11 * 16(%rsi);
1172
1173 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1174 vpxor 12 * 16(%rdx), %xmm0, %xmm3;
1175 vmovdqu %xmm0, 12 * 16(%rsi);
1176
1177 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1178 vpxor 13 * 16(%rdx), %xmm0, %xmm2;
1179 vmovdqu %xmm0, 13 * 16(%rsi);
1180
1181 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1182 vpxor 14 * 16(%rdx), %xmm0, %xmm1;
1183 vmovdqu %xmm0, 14 * 16(%rsi);
1184
1185 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1186 vpxor 15 * 16(%rdx), %xmm0, %xmm15;
1187 vmovdqu %xmm15, 0 * 16(%rax);
1188 vmovdqu %xmm0, 15 * 16(%rsi);
1189
 /* save the 17th tweak back to *%rcx for the caller's next 16 blocks */
1190 gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
1191 vmovdqu %xmm0, (%rcx);
1192
 /*
  * Mix the selected pre-whitening subkey into all 16 whitened blocks.
  * Note the register order the blk16 core expects is reversed: xmm0
  * gets block 15 (from 0*16(%rax)), xmm15 gets block 0 (15*16(%rax)).
  */
1193 /* inpack16_pre: */
1194 vmovq (key_table)(CTX, %r8, 8), %xmm15;
1195 vpshufb .Lpack_bswap, %xmm15, %xmm15;
1196 vpxor 0 * 16(%rax), %xmm15, %xmm0;
1197 vpxor %xmm1, %xmm15, %xmm1;
1198 vpxor %xmm2, %xmm15, %xmm2;
1199 vpxor %xmm3, %xmm15, %xmm3;
1200 vpxor %xmm4, %xmm15, %xmm4;
1201 vpxor %xmm5, %xmm15, %xmm5;
1202 vpxor %xmm6, %xmm15, %xmm6;
1203 vpxor %xmm7, %xmm15, %xmm7;
1204 vpxor %xmm8, %xmm15, %xmm8;
1205 vpxor %xmm9, %xmm15, %xmm9;
1206 vpxor %xmm10, %xmm15, %xmm10;
1207 vpxor %xmm11, %xmm15, %xmm11;
1208 vpxor %xmm12, %xmm15, %xmm12;
1209 vpxor %xmm13, %xmm15, %xmm13;
1210 vpxor 14 * 16(%rax), %xmm15, %xmm14;
1211 vpxor 15 * 16(%rax), %xmm15, %xmm15;
1212
 /* run the 16-way Camellia enc/dec core chosen by the caller */
1213 call *%r9;
1214
 /* scratch no longer needed; release it before writing output */
1215 addq $(16 * 16), %rsp;
1216
 /* post-whiten: xor each output block with its tweak (stashed in dst) */
1217 vpxor 0 * 16(%rsi), %xmm7, %xmm7;
1218 vpxor 1 * 16(%rsi), %xmm6, %xmm6;
1219 vpxor 2 * 16(%rsi), %xmm5, %xmm5;
1220 vpxor 3 * 16(%rsi), %xmm4, %xmm4;
1221 vpxor 4 * 16(%rsi), %xmm3, %xmm3;
1222 vpxor 5 * 16(%rsi), %xmm2, %xmm2;
1223 vpxor 6 * 16(%rsi), %xmm1, %xmm1;
1224 vpxor 7 * 16(%rsi), %xmm0, %xmm0;
1225 vpxor 8 * 16(%rsi), %xmm15, %xmm15;
1226 vpxor 9 * 16(%rsi), %xmm14, %xmm14;
1227 vpxor 10 * 16(%rsi), %xmm13, %xmm13;
1228 vpxor 11 * 16(%rsi), %xmm12, %xmm12;
1229 vpxor 12 * 16(%rsi), %xmm11, %xmm11;
1230 vpxor 13 * 16(%rsi), %xmm10, %xmm10;
1231 vpxor 14 * 16(%rsi), %xmm9, %xmm9;
1232 vpxor 15 * 16(%rsi), %xmm8, %xmm8;
1233 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1234 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1235 %xmm8, %rsi);
1236
1237 ret;
1238ENDPROC(camellia_xts_crypt_16way)
1239
/*
 * XTS encryption of 16 consecutive blocks: select the encryption core and
 * the first whitening subkey (index 0), then tail-jump into the shared
 * XTS body.
 */
1240ENTRY(camellia_xts_enc_16way)
1241 /* input:
1242 * %rdi: ctx, CTX
1243 * %rsi: dst (16 blocks)
1244 * %rdx: src (16 blocks)
1245 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1246 */
1247 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1248
 /* %r9 = 16-way encryption core, called indirectly by the shared body */
1249 leaq __camellia_enc_blk16, %r9;
1250
 /* tail-jump: the shared body returns directly to our caller */
1251 jmp camellia_xts_crypt_16way;
1252ENDPROC(camellia_xts_enc_16way)
1253
/*
 * XTS decryption of 16 consecutive blocks: select the decryption core and
 * the LAST whitening subkey (its key_table index depends on the key size),
 * then tail-jump into the shared XTS body.
 */
1254ENTRY(camellia_xts_dec_16way)
1255 /* input:
1256 * %rdi: ctx, CTX
1257 * %rsi: dst (16 blocks)
1258 * %rdx: src (16 blocks)
1259 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
1260 */
1261
 /* %r8 = 24 for 128-bit keys (key_length == 16), 32 otherwise */
1262 cmpl $16, key_length(CTX);
1263 movl $32, %r8d;
1264 movl $24, %eax;
1265 cmovel %eax, %r8d; /* input whitening key, last for dec */
1266
 /* %r9 = 16-way decryption core, called indirectly by the shared body */
1267 leaq __camellia_dec_blk16, %r9;
1268
 /* tail-jump: the shared body returns directly to our caller */
1269 jmp camellia_xts_crypt_16way;
1270ENDPROC(camellia_xts_dec_16way)