-rw-r--r--  arch/x86/crypto/sha1_avx2_x86_64_asm.S  67
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c         2
2 files changed, 37 insertions, 32 deletions
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792db15ef..1eab79c9ac48 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
 	.set T1, REG_T1
 .endm

-#define K_BASE		%r8
 #define HASH_PTR	%r9
+#define BLOCKS_CTR	%r8
 #define BUFFER_PTR	%r10
 #define BUFFER_PTR2	%r13
-#define BUFFER_END	%r11

 #define PRECALC_BUF	%r14
 #define WK_BUF		%r15
@@ -205,14 +204,14 @@
 	 * blended AVX2 and ALU instruction scheduling
 	 * 1 vector iteration per 8 rounds
 	 */
-	vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+	vmovdqu (i * 2)(BUFFER_PTR), W_TMP
 .elseif ((i & 7) == 1)
-	vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+	vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
 		 WY_TMP, WY_TMP
 .elseif ((i & 7) == 2)
 	vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
 .elseif ((i & 7) == 4)
-	vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 .elseif ((i & 7) == 7)
 	vmovdqu	WY_TMP, PRECALC_WK(i&~7)

@@ -255,7 +254,7 @@
 	vpxor	WY, WY_TMP, WY_TMP
 .elseif ((i & 7) == 7)
 	vpxor	WY_TMP2, WY_TMP, WY
-	vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	vmovdqu	WY_TMP, PRECALC_WK(i&~7)

 	PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
 	vpsrld	$30, WY, WY
 	vpor	WY, WY_TMP, WY
 .elseif ((i & 7) == 7)
-	vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	vmovdqu	WY_TMP, PRECALC_WK(i&~7)

 	PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@

 .endm

+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov	\a, RTA
+	add	$\d, RTA
+	cmp	$\c, \b
+	cmovge	RTA, \a
+.endm
+
 /*
  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
  */
@@ -463,13 +472,16 @@
 	lea	(2*4*80+32)(%rsp), WK_BUF

 	# Precalc WK for first 2 blocks
-	PRECALC_OFFSET = 0
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
 	.set i, 0
 	.rept	160
 		PRECALC i
 		.set i, i + 1
 	.endr
-	PRECALC_OFFSET = 128
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 	xchg	WK_BUF, PRECALC_BUF

 	.align 32
@@ -479,8 +491,8 @@ _loop:
 	 * we use K_BASE value as a signal of a last block,
 	 * it is set below by: cmovae BUFFER_PTR, K_BASE
 	 */
-	cmp	K_BASE, BUFFER_PTR
-	jne	_begin
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jnz	_begin
 	.align 32
 	jmp	_end
 	.align 32
@@ -512,10 +524,10 @@ _loop0:
 		.set j, j+2
 	.endr

-	add	$(2*64), BUFFER_PTR	/* move to next odd-64-byte block */
-	cmp	BUFFER_END, BUFFER_PTR	/* is current block the last one? */
-	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */
-
+	/* Update Counter */
+	sub	$1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
 	/*
 	 * rounds
 	 * 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
 	UPDATE_HASH	12(HASH_PTR), D
 	UPDATE_HASH	16(HASH_PTR), E

-	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
-	je	_loop
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jz	_loop

 	mov	TB, B

@@ -575,10 +587,10 @@ _loop2:
 		.set j, j+2
 	.endr

-	add	$(2*64), BUFFER_PTR2	/* move to next even-64-byte block */
-
-	cmp	BUFFER_END, BUFFER_PTR2	/* is current block the last one */
-	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */
+	/* update counter */
+	sub	$1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

 	jmp	_loop3
 _loop3:
@@ -641,19 +653,12 @@ _loop3:

 	avx2_zeroupper

-	lea	K_XMM_AR(%rip), K_BASE
-
+	/* Setup initial values */
 	mov	CTX, HASH_PTR
 	mov	BUF, BUFFER_PTR
-	lea	64(BUF), BUFFER_PTR2
-
-	shl	$6, CNT			/* mul by 64 */
-	add	BUF, CNT
-	add	$64, CNT
-	mov	CNT, BUFFER_END

-	cmp	BUFFER_END, BUFFER_PTR2
-	cmovae	K_BASE, BUFFER_PTR2
+	mov	BUF, BUFFER_PTR2
+	mov	CNT, BLOCKS_CTR

 	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

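The core of the assembly change above is the new ADD_IF_GE macro: it advances a buffer pointer by a fixed step only while enough blocks remain for the two-block software pipeline, replacing the old BUFFER_END/K_BASE sentinel scheme. A minimal C sketch of the macro's semantics follows; advance_if_ge and its parameter names are illustrative stand-ins, not kernel symbols.

#include <stdint.h>
#include <stddef.h>

/*
 * Illustrative model of the ADD_IF_GE assembler macro (a sketch, not
 * kernel code): advance 'ptr' by 'step' bytes only if 'blocks_left' is
 * still at least 'min_blocks'.  This mirrors the macro body:
 *   mov \a, RTA;  add $\d, RTA;  cmp $\c, \b;  cmovge RTA, \a
 */
static inline const uint8_t *advance_if_ge(const uint8_t *ptr,
					    int64_t blocks_left,
					    int64_t min_blocks,
					    size_t step)
{
	/* cmovge is a signed compare, hence the signed block counters */
	return (blocks_left >= min_blocks) ? ptr + step : ptr;
}

Read this way, ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 steps BUFFER_PTR over two 64-byte blocks only while at least four blocks are still outstanding, and the test BLOCKS_CTR, BLOCKS_CTR / jz checks end the loop once the counter reaches zero, so the precalc loads stop advancing past the data the caller actually passed in.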
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f960a043cdeb..fc61739150e7 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,

 static bool avx2_usable(void)
 {
-	if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
 		&& boot_cpu_has(X86_FEATURE_BMI1)
 		&& boot_cpu_has(X86_FEATURE_BMI2))
 		return true;
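With the assembly now bounded by BLOCKS_CTR rather than the removed BUFFER_END/K_BASE sentinel, the glue change above drops the hard-coded false && short-circuit, so avx2_usable() once again reports the AVX2 path as usable on CPUs with AVX2, BMI1 and BMI2.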
