Diffstat (limited to 'arch/x86/crypto/camellia-aesni-avx-asm_64.S')
-rw-r--r-- | arch/x86/crypto/camellia-aesni-avx-asm_64.S | 180
1 file changed, 179 insertions, 1 deletion
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index cfc163469c71..ce71f9212409 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -1,7 +1,7 @@
 /*
  * x86_64/AVX/AES-NI assembler implementation of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
 /*
  * pre-SubByte transform
  *
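The new .Lxts_gf128mul_and_shl1_mask constant feeds the gf128mul_x_ble macro added in the next hunk: vpaddq doubles each 64-bit half of the tweak, while vpsrad/vpshufd extract and reposition the two sign bits so that vpand with this mask turns them into the carries, 1 into bit 64 and 0x87 (the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1 folded into the low byte). As a reference for what that computes, here is a minimal C sketch of the same GF(2^128) doubling on a tweak held as two little-endian 64-bit halves; the struct and names are illustrative, not taken from the kernel sources.

#include <stdint.h>

/* 128-bit XTS tweak as two little-endian 64-bit halves: 'a' holds bits
 * 0..63, 'b' holds bits 64..127 (illustrative layout). */
struct xts_tweak {
	uint64_t a;
	uint64_t b;
};

/* Multiply the tweak by x (alpha) in GF(2^128): shift the 128-bit value
 * left by one bit; the bit carried out of the low half moves into bit 64,
 * and a carry out of bit 127 is reduced back in as 0x87. */
static void gf128mul_x_ble(struct xts_tweak *t)
{
	uint64_t carry_lo = t->a >> 63;			/* bit 63 -> bit 64 */
	uint64_t carry_hi = (t->b >> 63) ? 0x87 : 0;	/* bit 127 -> 0x87 */

	t->b = (t->b << 1) | carry_lo;
	t->a = (t->a << 1) ^ carry_hi;
}

Repeated doubling of the previous tweak is what lets the assembly below derive all 16 per-block tweaks from the single IV passed in %rcx.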
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)
 
 	ret;
 ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+.align 8
+camellia_xts_crypt_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: index for input whitening key
+	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+	/* load IV */
+	vmovdqu (%rcx), %xmm0;
+	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 15 * 16(%rax);
+	vmovdqu %xmm0, 0 * 16(%rsi);
+
+	/* construct IVs */
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 14 * 16(%rax);
+	vmovdqu %xmm0, 1 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+	vmovdqu %xmm0, 2 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+	vmovdqu %xmm0, 3 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+	vmovdqu %xmm0, 4 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+	vmovdqu %xmm0, 5 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+	vmovdqu %xmm0, 6 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+	vmovdqu %xmm0, 7 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+	vmovdqu %xmm0, 8 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+	vmovdqu %xmm0, 9 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+	vmovdqu %xmm0, 10 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+	vmovdqu %xmm0, 11 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+	vmovdqu %xmm0, 12 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+	vmovdqu %xmm0, 13 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+	vmovdqu %xmm0, 14 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 0 * 16(%rax);
+	vmovdqu %xmm0, 15 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vmovdqu %xmm0, (%rcx);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX, %r8, 8), %xmm15;
+	vpshufb .Lpack_bswap, %xmm15, %xmm15;
+	vpxor 0 * 16(%rax), %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call *%r9;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+ENDPROC(camellia_xts_crypt_16way)
+
+ENTRY(camellia_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)
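camellia_xts_enc_16way and camellia_xts_dec_16way are thin wrappers around the shared camellia_xts_crypt_16way body: encryption passes whitening-key index 0 and __camellia_enc_blk16, while decryption selects the last whitening key (key_table index 24 for 128-bit keys, 32 otherwise, per the cmovel above) and __camellia_dec_blk16. For reference, the sketch below shows the same per-chunk XTS dataflow in plain C, using hypothetical one-block helpers camellia_encrypt_block()/camellia_decrypt_block() in place of the 16-way cores and ignoring the register/stack scheduling; it illustrates the construction and is not kernel code.

#include <stdint.h>
#include <string.h>

#define XTS_BLOCK_SIZE	16
#define XTS_PARALLEL	16	/* blocks processed per call, as in the asm */

/* Hypothetical single-block Camellia helpers standing in for the 16-way
 * __camellia_enc_blk16/__camellia_dec_blk16 cores reached through %r9. */
void camellia_encrypt_block(const void *ctx, uint8_t *dst, const uint8_t *src);
void camellia_decrypt_block(const void *ctx, uint8_t *dst, const uint8_t *src);

/* Byte-wise equivalent of gf128mul_x_ble: shift the little-endian 128-bit
 * tweak left by one bit and fold a carry out of bit 127 back in as 0x87. */
static void xts_mul_x(uint8_t t[XTS_BLOCK_SIZE])
{
	uint8_t carry = 0;
	int i;

	for (i = 0; i < XTS_BLOCK_SIZE; i++) {
		uint8_t next = t[i] >> 7;

		t[i] = (uint8_t)(t[i] << 1) | carry;
		carry = next;
	}
	if (carry)
		t[0] ^= 0x87;
}

/* One 16-block XTS pass: out = E(in ^ T) ^ T, with the tweak T doubled in
 * GF(2^128) after every block and the tweak for the next chunk written back
 * through 'iv' (as the assembly does through %rcx). */
static void xts_crypt_16way_ref(const void *ctx, uint8_t *dst,
				const uint8_t *src, uint8_t *iv, int decrypt)
{
	uint8_t t[XTS_BLOCK_SIZE];
	int i, j;

	memcpy(t, iv, sizeof(t));

	for (i = 0; i < XTS_PARALLEL; i++) {
		uint8_t buf[XTS_BLOCK_SIZE];

		for (j = 0; j < XTS_BLOCK_SIZE; j++)
			buf[j] = src[i * XTS_BLOCK_SIZE + j] ^ t[j];

		if (decrypt)
			camellia_decrypt_block(ctx, buf, buf);
		else
			camellia_encrypt_block(ctx, buf, buf);

		for (j = 0; j < XTS_BLOCK_SIZE; j++)
			dst[i * XTS_BLOCK_SIZE + j] = buf[j] ^ t[j];

		xts_mul_x(t);
	}

	memcpy(iv, t, sizeof(t));
}

The assembly interleaves these steps differently: it first stores all 16 tweaks to dst and the tweak-masked source blocks to registers and the stack, runs the whole batch through the Camellia core once, and finally XORs the tweaks parked in dst into the result with write_output.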