author     Linus Torvalds <torvalds@linux-foundation.org>    2013-05-02 17:53:12 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-05-02 17:53:12 -0400
commit     797994f81a8b2bdca2eecffa415c1e7a89a4f961
tree       1383dc469c26ad37fdf960f682d9a48c782935c5 /arch
parent     c8d8566952fda026966784a62f324c8352f77430
parent     3862de1f6c442d53bd828d39f86d07d933a70605
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
- XTS mode optimisation for twofish/cast6/camellia/aes on x86
- AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia
- SSSE3/AVX/AVX2 optimisations for sha256/sha512
- Added driver for SAHARA2 crypto accelerator
- Fix for GMAC when used in non-IPsec scenarios
- Added generic CMAC implementation (including IPsec glue)
- IP update for crypto/atmel
- Support for more than one device in hwrng/timeriomem
- Added Broadcom BCM2835 RNG driver
- Misc fixes
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits)
crypto: caam - fix job ring cleanup code
crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher
crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher
crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
crypto: tcrypt - add async cipher speed tests for blowfish
crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2
crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86
crypto: aesni_intel - add more optimized XTS mode for x86-64
crypto: x86/camellia-aesni-avx - add more optimized XTS code
crypto: cast6-avx: use new optimized XTS code
crypto: x86/twofish-avx - use optimized XTS code
crypto: x86 - add more optimized XTS-mode for serpent-avx
xfrm: add rfc4494 AES-CMAC-96 support
crypto: add CMAC support to CryptoAPI
crypto: testmgr - add empty test vectors for null ciphers
crypto: testmgr - add AES GMAC test vectors
crypto: gcm - fix rfc4543 to handle async crypto correctly
crypto: gcm - make GMAC work when dst and src are different
hwrng: timeriomem - added devicetree hooks
...
Diffstat (limited to 'arch')
40 files changed, 10762 insertions, 240 deletions
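The generic CMAC template mentioned in the summary above registers as "cmac(<cipher>)" in the crypto API. As a hedged illustration (not code from this pull), a kernel user could compute an AES-CMAC through the synchronous shash interface roughly as below; the wrapper name cmac_aes_digest is made up for the example.

```c
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Sketch only: compute a 16-byte AES-CMAC over a contiguous buffer. */
static int cmac_aes_digest(const u8 *key, unsigned int keylen,
			   const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		err = -ENOMEM;
		goto out_free_tfm;
	}
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, data, len, out);

	kfree(desc);
out_free_tfm:
	crypto_free_shash(tfm);
	return err;
}
```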
diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c
index 827c9f2a70fb..f0bf68268ca2 100644
--- a/arch/arm/mach-at91/at91sam9g45_devices.c
+++ b/arch/arm/mach-at91/at91sam9g45_devices.c
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/platform_device.h> | 18 | #include <linux/platform_device.h> |
19 | #include <linux/i2c-gpio.h> | 19 | #include <linux/i2c-gpio.h> |
20 | #include <linux/atmel-mci.h> | 20 | #include <linux/atmel-mci.h> |
21 | #include <linux/platform_data/atmel-aes.h> | 21 | #include <linux/platform_data/crypto-atmel.h> |
22 | 22 | ||
23 | #include <linux/platform_data/at91_adc.h> | 23 | #include <linux/platform_data/at91_adc.h> |
24 | 24 | ||
@@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {} | |||
1900 | * -------------------------------------------------------------------- */ | 1900 | * -------------------------------------------------------------------- */ |
1901 | 1901 | ||
1902 | #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE) | 1902 | #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE) |
1903 | static struct aes_platform_data aes_data; | 1903 | static struct crypto_platform_data aes_data; |
1904 | static struct crypto_dma_data alt_atslave; | ||
1904 | static u64 aes_dmamask = DMA_BIT_MASK(32); | 1905 | static u64 aes_dmamask = DMA_BIT_MASK(32); |
1905 | 1906 | ||
1906 | static struct resource aes_resources[] = { | 1907 | static struct resource aes_resources[] = { |
@@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = { | |||
1931 | static void __init at91_add_device_aes(void) | 1932 | static void __init at91_add_device_aes(void) |
1932 | { | 1933 | { |
1933 | struct at_dma_slave *atslave; | 1934 | struct at_dma_slave *atslave; |
1934 | struct aes_dma_data *alt_atslave; | ||
1935 | |||
1936 | alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL); | ||
1937 | 1935 | ||
1938 | /* DMA TX slave channel configuration */ | 1936 | /* DMA TX slave channel configuration */ |
1939 | atslave = &alt_atslave->txdata; | 1937 | atslave = &alt_atslave.txdata; |
1940 | atslave->dma_dev = &at_hdmac_device.dev; | 1938 | atslave->dma_dev = &at_hdmac_device.dev; |
1941 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW | | 1939 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW | |
1942 | ATC_SRC_PER(AT_DMA_ID_AES_RX); | 1940 | ATC_SRC_PER(AT_DMA_ID_AES_RX); |
1943 | 1941 | ||
1944 | /* DMA RX slave channel configuration */ | 1942 | /* DMA RX slave channel configuration */ |
1945 | atslave = &alt_atslave->rxdata; | 1943 | atslave = &alt_atslave.rxdata; |
1946 | atslave->dma_dev = &at_hdmac_device.dev; | 1944 | atslave->dma_dev = &at_hdmac_device.dev; |
1947 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW | | 1945 | atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW | |
1948 | ATC_DST_PER(AT_DMA_ID_AES_TX); | 1946 | ATC_DST_PER(AT_DMA_ID_AES_TX); |
1949 | 1947 | ||
1950 | aes_data.dma_slave = alt_atslave; | 1948 | aes_data.dma_slave = &alt_atslave; |
1951 | platform_device_register(&at91sam9g45_aes_device); | 1949 | platform_device_register(&at91sam9g45_aes_device); |
1952 | } | 1950 | } |
1953 | #else | 1951 | #else |
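The board-file hunk above switches from <linux/platform_data/atmel-aes.h> to the new <linux/platform_data/crypto-atmel.h> and replaces the kzalloc'd aes_dma_data with a static crypto_dma_data. A hedged sketch of the structure shape at91_add_device_aes() appears to assume follows; the field layout is inferred from this diff, not copied from the new header.

```c
#include <linux/platform_data/dma-atmel.h>	/* struct at_dma_slave (assumed location) */

/* Inferred shape of the types used by at91_add_device_aes() above. */
struct crypto_dma_data {
	struct at_dma_slave	txdata;		/* DMA TX slave channel configuration */
	struct at_dma_slave	rxdata;		/* DMA RX slave channel configuration */
};

struct crypto_platform_data {
	struct crypto_dma_data	*dma_slave;	/* set to &alt_atslave in the code above */
};
```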
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 63947a8f9f0f..a3a0ed80f17c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,10 @@ | |||
2 | # Arch-specific CryptoAPI modules. | 2 | # Arch-specific CryptoAPI modules. |
3 | # | 3 | # |
4 | 4 | ||
5 | avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) | ||
6 | avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ | ||
7 | $(comma)4)$(comma)%ymm2,yes,no) | ||
8 | |||
5 | obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o | 9 | obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o |
6 | obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o | 10 | obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o |
7 | 11 | ||
@@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o | |||
12 | 16 | ||
13 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o | 17 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o |
14 | obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o | 18 | obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o |
15 | obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o | ||
16 | obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o | ||
17 | obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o | ||
18 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o | 19 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o |
19 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o | 20 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o |
20 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o | 21 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o |
21 | obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o | ||
22 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o | 22 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o |
23 | obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o | 23 | obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o |
24 | obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o | ||
25 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o | 24 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o |
26 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o | 25 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o |
27 | 26 | ||
28 | obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o | 27 | obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o |
29 | obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o | 28 | obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o |
30 | obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o | 29 | obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o |
30 | obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o | ||
31 | obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o | ||
32 | |||
33 | # These modules require assembler to support AVX. | ||
34 | ifeq ($(avx_supported),yes) | ||
35 | obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ | ||
36 | camellia-aesni-avx-x86_64.o | ||
37 | obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o | ||
38 | obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o | ||
39 | obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o | ||
40 | obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o | ||
41 | endif | ||
42 | |||
43 | # These modules require assembler to support AVX2. | ||
44 | ifeq ($(avx2_supported),yes) | ||
45 | obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o | ||
46 | obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o | ||
47 | obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o | ||
48 | obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o | ||
49 | endif | ||
31 | 50 | ||
32 | aes-i586-y := aes-i586-asm_32.o aes_glue.o | 51 | aes-i586-y := aes-i586-asm_32.o aes_glue.o |
33 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o | 52 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o |
@@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o | |||
36 | 55 | ||
37 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o | 56 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o |
38 | camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o | 57 | camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o |
39 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ | ||
40 | camellia_aesni_avx_glue.o | ||
41 | cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o | ||
42 | cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o | ||
43 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o | 58 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o |
44 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o | 59 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o |
45 | twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o | 60 | twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o |
46 | twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o | ||
47 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o | 61 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o |
48 | serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o | 62 | serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o |
49 | serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o | 63 | |
64 | ifeq ($(avx_supported),yes) | ||
65 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ | ||
66 | camellia_aesni_avx_glue.o | ||
67 | cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o | ||
68 | cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o | ||
69 | twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \ | ||
70 | twofish_avx_glue.o | ||
71 | serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \ | ||
72 | serpent_avx_glue.o | ||
73 | endif | ||
74 | |||
75 | ifeq ($(avx2_supported),yes) | ||
76 | blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o | ||
77 | camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o | ||
78 | serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o | ||
79 | twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o | ||
80 | endif | ||
50 | 81 | ||
51 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o | 82 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o |
52 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o | 83 | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o |
53 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o | 84 | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o |
54 | crc32c-intel-y := crc32c-intel_glue.o | 85 | crc32c-intel-y := crc32c-intel_glue.o |
55 | crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o | 86 | crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o |
56 | crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o | 87 | crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o |
88 | sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o | ||
89 | sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o | ||
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 04b797767b9e..62fe22cd4cba 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -34,6 +34,10 @@ | |||
34 | 34 | ||
35 | #ifdef __x86_64__ | 35 | #ifdef __x86_64__ |
36 | .data | 36 | .data |
37 | .align 16 | ||
38 | .Lgf128mul_x_ble_mask: | ||
39 | .octa 0x00000000000000010000000000000087 | ||
40 | |||
37 | POLY: .octa 0xC2000000000000000000000000000001 | 41 | POLY: .octa 0xC2000000000000000000000000000001 |
38 | TWOONE: .octa 0x00000001000000000000000000000001 | 42 | TWOONE: .octa 0x00000001000000000000000000000001 |
39 | 43 | ||
@@ -105,6 +109,8 @@ enc: .octa 0x2 | |||
105 | #define CTR %xmm11 | 109 | #define CTR %xmm11 |
106 | #define INC %xmm12 | 110 | #define INC %xmm12 |
107 | 111 | ||
112 | #define GF128MUL_MASK %xmm10 | ||
113 | |||
108 | #ifdef __x86_64__ | 114 | #ifdef __x86_64__ |
109 | #define AREG %rax | 115 | #define AREG %rax |
110 | #define KEYP %rdi | 116 | #define KEYP %rdi |
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc) | |||
2636 | .Lctr_enc_just_ret: | 2642 | .Lctr_enc_just_ret: |
2637 | ret | 2643 | ret |
2638 | ENDPROC(aesni_ctr_enc) | 2644 | ENDPROC(aesni_ctr_enc) |
2645 | |||
2646 | /* | ||
2647 | * _aesni_gf128mul_x_ble: internal ABI | ||
2648 | * Multiply in GF(2^128) for XTS IVs | ||
2649 | * input: | ||
2650 | * IV: current IV | ||
2651 | * GF128MUL_MASK == mask with 0x87 and 0x01 | ||
2652 | * output: | ||
2653 | * IV: next IV | ||
2654 | * changed: | ||
2655 | * CTR: == temporary value | ||
2656 | */ | ||
2657 | #define _aesni_gf128mul_x_ble() \ | ||
2658 | pshufd $0x13, IV, CTR; \ | ||
2659 | paddq IV, IV; \ | ||
2660 | psrad $31, CTR; \ | ||
2661 | pand GF128MUL_MASK, CTR; \ | ||
2662 | pxor CTR, IV; | ||
2663 | |||
2664 | /* | ||
2665 | * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, | ||
2666 | * bool enc, u8 *iv) | ||
2667 | */ | ||
2668 | ENTRY(aesni_xts_crypt8) | ||
2669 | cmpb $0, %cl | ||
2670 | movl $0, %ecx | ||
2671 | movl $240, %r10d | ||
2672 | leaq _aesni_enc4, %r11 | ||
2673 | leaq _aesni_dec4, %rax | ||
2674 | cmovel %r10d, %ecx | ||
2675 | cmoveq %rax, %r11 | ||
2676 | |||
2677 | movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK | ||
2678 | movups (IVP), IV | ||
2679 | |||
2680 | mov 480(KEYP), KLEN | ||
2681 | addq %rcx, KEYP | ||
2682 | |||
2683 | movdqa IV, STATE1 | ||
2684 | pxor 0x00(INP), STATE1 | ||
2685 | movdqu IV, 0x00(OUTP) | ||
2686 | |||
2687 | _aesni_gf128mul_x_ble() | ||
2688 | movdqa IV, STATE2 | ||
2689 | pxor 0x10(INP), STATE2 | ||
2690 | movdqu IV, 0x10(OUTP) | ||
2691 | |||
2692 | _aesni_gf128mul_x_ble() | ||
2693 | movdqa IV, STATE3 | ||
2694 | pxor 0x20(INP), STATE3 | ||
2695 | movdqu IV, 0x20(OUTP) | ||
2696 | |||
2697 | _aesni_gf128mul_x_ble() | ||
2698 | movdqa IV, STATE4 | ||
2699 | pxor 0x30(INP), STATE4 | ||
2700 | movdqu IV, 0x30(OUTP) | ||
2701 | |||
2702 | call *%r11 | ||
2703 | |||
2704 | pxor 0x00(OUTP), STATE1 | ||
2705 | movdqu STATE1, 0x00(OUTP) | ||
2706 | |||
2707 | _aesni_gf128mul_x_ble() | ||
2708 | movdqa IV, STATE1 | ||
2709 | pxor 0x40(INP), STATE1 | ||
2710 | movdqu IV, 0x40(OUTP) | ||
2711 | |||
2712 | pxor 0x10(OUTP), STATE2 | ||
2713 | movdqu STATE2, 0x10(OUTP) | ||
2714 | |||
2715 | _aesni_gf128mul_x_ble() | ||
2716 | movdqa IV, STATE2 | ||
2717 | pxor 0x50(INP), STATE2 | ||
2718 | movdqu IV, 0x50(OUTP) | ||
2719 | |||
2720 | pxor 0x20(OUTP), STATE3 | ||
2721 | movdqu STATE3, 0x20(OUTP) | ||
2722 | |||
2723 | _aesni_gf128mul_x_ble() | ||
2724 | movdqa IV, STATE3 | ||
2725 | pxor 0x60(INP), STATE3 | ||
2726 | movdqu IV, 0x60(OUTP) | ||
2727 | |||
2728 | pxor 0x30(OUTP), STATE4 | ||
2729 | movdqu STATE4, 0x30(OUTP) | ||
2730 | |||
2731 | _aesni_gf128mul_x_ble() | ||
2732 | movdqa IV, STATE4 | ||
2733 | pxor 0x70(INP), STATE4 | ||
2734 | movdqu IV, 0x70(OUTP) | ||
2735 | |||
2736 | _aesni_gf128mul_x_ble() | ||
2737 | movups IV, (IVP) | ||
2738 | |||
2739 | call *%r11 | ||
2740 | |||
2741 | pxor 0x40(OUTP), STATE1 | ||
2742 | movdqu STATE1, 0x40(OUTP) | ||
2743 | |||
2744 | pxor 0x50(OUTP), STATE2 | ||
2745 | movdqu STATE2, 0x50(OUTP) | ||
2746 | |||
2747 | pxor 0x60(OUTP), STATE3 | ||
2748 | movdqu STATE3, 0x60(OUTP) | ||
2749 | |||
2750 | pxor 0x70(OUTP), STATE4 | ||
2751 | movdqu STATE4, 0x70(OUTP) | ||
2752 | |||
2753 | ret | ||
2754 | ENDPROC(aesni_xts_crypt8) | ||
2755 | |||
2639 | #endif | 2756 | #endif |
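The _aesni_gf128mul_x_ble() macro added above advances the XTS tweak from one block to the next. A hedged, standalone C sketch of the same operation (not taken from this patch): the 128-bit tweak is treated as two 64-bit halves with the low half first, shifted left by one bit, and reduced with the XTS polynomial constant 0x87 when a bit falls off the top.

```c
#include <stdint.h>

/* Multiply the XTS tweak by x in GF(2^128); t[0] holds the low 64 bits. */
static void gf128mul_x_ble(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of the top half */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* shift the 128-bit value left by one */
	t[0] = (t[0] << 1) ^ (carry * 0x87);	/* fold the carry back: x^128 = x^7 + x^2 + x + 1 */
}
```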
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index a0795da22c02..f80e668785c0 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -39,6 +39,9 @@ | |||
39 | #include <crypto/internal/aead.h> | 39 | #include <crypto/internal/aead.h> |
40 | #include <linux/workqueue.h> | 40 | #include <linux/workqueue.h> |
41 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #ifdef CONFIG_X86_64 | ||
43 | #include <asm/crypto/glue_helper.h> | ||
44 | #endif | ||
42 | 45 | ||
43 | #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) | 46 | #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) |
44 | #define HAS_PCBC | 47 | #define HAS_PCBC |
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void); | |||
102 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, | 105 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, |
103 | const u8 *in, unsigned int len, u8 *iv); | 106 | const u8 *in, unsigned int len, u8 *iv); |
104 | 107 | ||
108 | asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, | ||
109 | const u8 *in, bool enc, u8 *iv); | ||
110 | |||
105 | /* asmlinkage void aesni_gcm_enc() | 111 | /* asmlinkage void aesni_gcm_enc() |
106 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. | 112 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. |
107 | * u8 *out, Ciphertext output. Encrypt in-place is allowed. | 113 | * u8 *out, Ciphertext output. Encrypt in-place is allowed. |
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) | |||
510 | aesni_enc(ctx, out, in); | 516 | aesni_enc(ctx, out, in); |
511 | } | 517 | } |
512 | 518 | ||
519 | #ifdef CONFIG_X86_64 | ||
520 | |||
521 | static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
522 | { | ||
523 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); | ||
524 | } | ||
525 | |||
526 | static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
527 | { | ||
528 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); | ||
529 | } | ||
530 | |||
531 | static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
532 | { | ||
533 | aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); | ||
534 | } | ||
535 | |||
536 | static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
537 | { | ||
538 | aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); | ||
539 | } | ||
540 | |||
541 | static const struct common_glue_ctx aesni_enc_xts = { | ||
542 | .num_funcs = 2, | ||
543 | .fpu_blocks_limit = 1, | ||
544 | |||
545 | .funcs = { { | ||
546 | .num_blocks = 8, | ||
547 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } | ||
548 | }, { | ||
549 | .num_blocks = 1, | ||
550 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } | ||
551 | } } | ||
552 | }; | ||
553 | |||
554 | static const struct common_glue_ctx aesni_dec_xts = { | ||
555 | .num_funcs = 2, | ||
556 | .fpu_blocks_limit = 1, | ||
557 | |||
558 | .funcs = { { | ||
559 | .num_blocks = 8, | ||
560 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } | ||
561 | }, { | ||
562 | .num_blocks = 1, | ||
563 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } | ||
564 | } } | ||
565 | }; | ||
566 | |||
567 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
568 | struct scatterlist *src, unsigned int nbytes) | ||
569 | { | ||
570 | struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
571 | |||
572 | return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, | ||
573 | XTS_TWEAK_CAST(aesni_xts_tweak), | ||
574 | aes_ctx(ctx->raw_tweak_ctx), | ||
575 | aes_ctx(ctx->raw_crypt_ctx)); | ||
576 | } | ||
577 | |||
578 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
579 | struct scatterlist *src, unsigned int nbytes) | ||
580 | { | ||
581 | struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
582 | |||
583 | return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, | ||
584 | XTS_TWEAK_CAST(aesni_xts_tweak), | ||
585 | aes_ctx(ctx->raw_tweak_ctx), | ||
586 | aes_ctx(ctx->raw_crypt_ctx)); | ||
587 | } | ||
588 | |||
589 | #else | ||
590 | |||
513 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 591 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
514 | struct scatterlist *src, unsigned int nbytes) | 592 | struct scatterlist *src, unsigned int nbytes) |
515 | { | 593 | { |
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
560 | return ret; | 638 | return ret; |
561 | } | 639 | } |
562 | 640 | ||
641 | #endif | ||
642 | |||
563 | #ifdef CONFIG_X86_64 | 643 | #ifdef CONFIG_X86_64 |
564 | static int rfc4106_init(struct crypto_tfm *tfm) | 644 | static int rfc4106_init(struct crypto_tfm *tfm) |
565 | { | 645 | { |
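The glue code above splits XTS work between the 8-block assembler path (aesni_xts_crypt8) and a single-block fallback selected through common_glue_ctx. For reference, a hedged C sketch of what XTS encryption does per block (not kernel code; aes_encrypt_block() and gf128mul_x_ble() are assumed helpers, the latter as sketched after the assembler file above).

```c
#include <stdint.h>

/* Assumed helpers for this sketch, not real kernel API in this form. */
void aes_encrypt_block(const void *key, uint8_t *out, const uint8_t *in);
void gf128mul_x_ble(uint64_t tweak[2]);

static void xts_encrypt_blocks(const void *key, uint8_t *dst, const uint8_t *src,
			       unsigned int nblocks, uint64_t tweak[2])
{
	const uint8_t *t = (const uint8_t *)tweak;
	unsigned int i, j;

	for (i = 0; i < nblocks; i++, src += 16, dst += 16) {
		uint8_t buf[16];

		for (j = 0; j < 16; j++)	/* pre-whiten with the current tweak */
			buf[j] = src[j] ^ t[j];

		aes_encrypt_block(key, buf, buf);

		for (j = 0; j < 16; j++)	/* post-whiten with the same tweak */
			dst[j] = buf[j] ^ t[j];

		gf128mul_x_ble(tweak);		/* next block uses tweak * x */
	}
}
```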
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
new file mode 100644
index 000000000000..784452e0d05d
--- /dev/null
+++ b/arch/x86/crypto/blowfish-avx2-asm_64.S
@@ -0,0 +1,449 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Blowfish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | |||
15 | .file "blowfish-avx2-asm_64.S" | ||
16 | |||
17 | .data | ||
18 | .align 32 | ||
19 | |||
20 | .Lprefetch_mask: | ||
21 | .long 0*64 | ||
22 | .long 1*64 | ||
23 | .long 2*64 | ||
24 | .long 3*64 | ||
25 | .long 4*64 | ||
26 | .long 5*64 | ||
27 | .long 6*64 | ||
28 | .long 7*64 | ||
29 | |||
30 | .Lbswap32_mask: | ||
31 | .long 0x00010203 | ||
32 | .long 0x04050607 | ||
33 | .long 0x08090a0b | ||
34 | .long 0x0c0d0e0f | ||
35 | |||
36 | .Lbswap128_mask: | ||
37 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
38 | .Lbswap_iv_mask: | ||
39 | .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 | ||
40 | |||
41 | .text | ||
42 | /* structure of crypto context */ | ||
43 | #define p 0 | ||
44 | #define s0 ((16 + 2) * 4) | ||
45 | #define s1 ((16 + 2 + (1 * 256)) * 4) | ||
46 | #define s2 ((16 + 2 + (2 * 256)) * 4) | ||
47 | #define s3 ((16 + 2 + (3 * 256)) * 4) | ||
48 | |||
49 | /* register macros */ | ||
50 | #define CTX %rdi | ||
51 | #define RIO %rdx | ||
52 | |||
53 | #define RS0 %rax | ||
54 | #define RS1 %r8 | ||
55 | #define RS2 %r9 | ||
56 | #define RS3 %r10 | ||
57 | |||
58 | #define RLOOP %r11 | ||
59 | #define RLOOPd %r11d | ||
60 | |||
61 | #define RXr0 %ymm8 | ||
62 | #define RXr1 %ymm9 | ||
63 | #define RXr2 %ymm10 | ||
64 | #define RXr3 %ymm11 | ||
65 | #define RXl0 %ymm12 | ||
66 | #define RXl1 %ymm13 | ||
67 | #define RXl2 %ymm14 | ||
68 | #define RXl3 %ymm15 | ||
69 | |||
70 | /* temp regs */ | ||
71 | #define RT0 %ymm0 | ||
72 | #define RT0x %xmm0 | ||
73 | #define RT1 %ymm1 | ||
74 | #define RT1x %xmm1 | ||
75 | #define RIDX0 %ymm2 | ||
76 | #define RIDX1 %ymm3 | ||
77 | #define RIDX1x %xmm3 | ||
78 | #define RIDX2 %ymm4 | ||
79 | #define RIDX3 %ymm5 | ||
80 | |||
81 | /* vpgatherdd mask and '-1' */ | ||
82 | #define RNOT %ymm6 | ||
83 | |||
84 | /* byte mask, (-1 >> 24) */ | ||
85 | #define RBYTE %ymm7 | ||
86 | |||
87 | /*********************************************************************** | ||
88 | * 32-way AVX2 blowfish | ||
89 | ***********************************************************************/ | ||
90 | #define F(xl, xr) \ | ||
91 | vpsrld $24, xl, RIDX0; \ | ||
92 | vpsrld $16, xl, RIDX1; \ | ||
93 | vpsrld $8, xl, RIDX2; \ | ||
94 | vpand RBYTE, RIDX1, RIDX1; \ | ||
95 | vpand RBYTE, RIDX2, RIDX2; \ | ||
96 | vpand RBYTE, xl, RIDX3; \ | ||
97 | \ | ||
98 | vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ | ||
99 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
100 | vpcmpeqd RIDX0, RIDX0, RIDX0; \ | ||
101 | \ | ||
102 | vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ | ||
103 | vpcmpeqd RIDX1, RIDX1, RIDX1; \ | ||
104 | vpaddd RT0, RT1, RT0; \ | ||
105 | \ | ||
106 | vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ | ||
107 | vpxor RT0, RT1, RT0; \ | ||
108 | \ | ||
109 | vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ | ||
110 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
111 | vpaddd RT0, RT1, RT0; \ | ||
112 | \ | ||
113 | vpxor RT0, xr, xr; | ||
114 | |||
115 | #define add_roundkey(xl, nmem) \ | ||
116 | vpbroadcastd nmem, RT0; \ | ||
117 | vpxor RT0, xl ## 0, xl ## 0; \ | ||
118 | vpxor RT0, xl ## 1, xl ## 1; \ | ||
119 | vpxor RT0, xl ## 2, xl ## 2; \ | ||
120 | vpxor RT0, xl ## 3, xl ## 3; | ||
121 | |||
122 | #define round_enc() \ | ||
123 | add_roundkey(RXr, p(CTX,RLOOP,4)); \ | ||
124 | F(RXl0, RXr0); \ | ||
125 | F(RXl1, RXr1); \ | ||
126 | F(RXl2, RXr2); \ | ||
127 | F(RXl3, RXr3); \ | ||
128 | \ | ||
129 | add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ | ||
130 | F(RXr0, RXl0); \ | ||
131 | F(RXr1, RXl1); \ | ||
132 | F(RXr2, RXl2); \ | ||
133 | F(RXr3, RXl3); | ||
134 | |||
135 | #define round_dec() \ | ||
136 | add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ | ||
137 | F(RXl0, RXr0); \ | ||
138 | F(RXl1, RXr1); \ | ||
139 | F(RXl2, RXr2); \ | ||
140 | F(RXl3, RXr3); \ | ||
141 | \ | ||
142 | add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ | ||
143 | F(RXr0, RXl0); \ | ||
144 | F(RXr1, RXl1); \ | ||
145 | F(RXr2, RXl2); \ | ||
146 | F(RXr3, RXl3); | ||
147 | |||
148 | #define init_round_constants() \ | ||
149 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
150 | leaq s0(CTX), RS0; \ | ||
151 | leaq s1(CTX), RS1; \ | ||
152 | leaq s2(CTX), RS2; \ | ||
153 | leaq s3(CTX), RS3; \ | ||
154 | vpsrld $24, RNOT, RBYTE; | ||
155 | |||
156 | #define transpose_2x2(x0, x1, t0) \ | ||
157 | vpunpckldq x0, x1, t0; \ | ||
158 | vpunpckhdq x0, x1, x1; \ | ||
159 | \ | ||
160 | vpunpcklqdq t0, x1, x0; \ | ||
161 | vpunpckhqdq t0, x1, x1; | ||
162 | |||
163 | #define read_block(xl, xr) \ | ||
164 | vbroadcasti128 .Lbswap32_mask, RT1; \ | ||
165 | \ | ||
166 | vpshufb RT1, xl ## 0, xl ## 0; \ | ||
167 | vpshufb RT1, xr ## 0, xr ## 0; \ | ||
168 | vpshufb RT1, xl ## 1, xl ## 1; \ | ||
169 | vpshufb RT1, xr ## 1, xr ## 1; \ | ||
170 | vpshufb RT1, xl ## 2, xl ## 2; \ | ||
171 | vpshufb RT1, xr ## 2, xr ## 2; \ | ||
172 | vpshufb RT1, xl ## 3, xl ## 3; \ | ||
173 | vpshufb RT1, xr ## 3, xr ## 3; \ | ||
174 | \ | ||
175 | transpose_2x2(xl ## 0, xr ## 0, RT0); \ | ||
176 | transpose_2x2(xl ## 1, xr ## 1, RT0); \ | ||
177 | transpose_2x2(xl ## 2, xr ## 2, RT0); \ | ||
178 | transpose_2x2(xl ## 3, xr ## 3, RT0); | ||
179 | |||
180 | #define write_block(xl, xr) \ | ||
181 | vbroadcasti128 .Lbswap32_mask, RT1; \ | ||
182 | \ | ||
183 | transpose_2x2(xl ## 0, xr ## 0, RT0); \ | ||
184 | transpose_2x2(xl ## 1, xr ## 1, RT0); \ | ||
185 | transpose_2x2(xl ## 2, xr ## 2, RT0); \ | ||
186 | transpose_2x2(xl ## 3, xr ## 3, RT0); \ | ||
187 | \ | ||
188 | vpshufb RT1, xl ## 0, xl ## 0; \ | ||
189 | vpshufb RT1, xr ## 0, xr ## 0; \ | ||
190 | vpshufb RT1, xl ## 1, xl ## 1; \ | ||
191 | vpshufb RT1, xr ## 1, xr ## 1; \ | ||
192 | vpshufb RT1, xl ## 2, xl ## 2; \ | ||
193 | vpshufb RT1, xr ## 2, xr ## 2; \ | ||
194 | vpshufb RT1, xl ## 3, xl ## 3; \ | ||
195 | vpshufb RT1, xr ## 3, xr ## 3; | ||
196 | |||
197 | .align 8 | ||
198 | __blowfish_enc_blk32: | ||
199 | /* input: | ||
200 | * %rdi: ctx, CTX | ||
201 | * RXl0..4, RXr0..4: plaintext | ||
202 | * output: | ||
203 | * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped) | ||
204 | */ | ||
205 | init_round_constants(); | ||
206 | |||
207 | read_block(RXl, RXr); | ||
208 | |||
209 | movl $1, RLOOPd; | ||
210 | add_roundkey(RXl, p+4*(0)(CTX)); | ||
211 | |||
212 | .align 4 | ||
213 | .L__enc_loop: | ||
214 | round_enc(); | ||
215 | |||
216 | leal 2(RLOOPd), RLOOPd; | ||
217 | cmpl $17, RLOOPd; | ||
218 | jne .L__enc_loop; | ||
219 | |||
220 | add_roundkey(RXr, p+4*(17)(CTX)); | ||
221 | |||
222 | write_block(RXl, RXr); | ||
223 | |||
224 | ret; | ||
225 | ENDPROC(__blowfish_enc_blk32) | ||
226 | |||
227 | .align 8 | ||
228 | __blowfish_dec_blk32: | ||
229 | /* input: | ||
230 | * %rdi: ctx, CTX | ||
231 | * RXl0..4, RXr0..4: ciphertext | ||
232 | * output: | ||
233 | * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped) | ||
234 | */ | ||
235 | init_round_constants(); | ||
236 | |||
237 | read_block(RXl, RXr); | ||
238 | |||
239 | movl $14, RLOOPd; | ||
240 | add_roundkey(RXl, p+4*(17)(CTX)); | ||
241 | |||
242 | .align 4 | ||
243 | .L__dec_loop: | ||
244 | round_dec(); | ||
245 | |||
246 | addl $-2, RLOOPd; | ||
247 | jns .L__dec_loop; | ||
248 | |||
249 | add_roundkey(RXr, p+4*(0)(CTX)); | ||
250 | |||
251 | write_block(RXl, RXr); | ||
252 | |||
253 | ret; | ||
254 | ENDPROC(__blowfish_dec_blk32) | ||
255 | |||
256 | ENTRY(blowfish_ecb_enc_32way) | ||
257 | /* input: | ||
258 | * %rdi: ctx, CTX | ||
259 | * %rsi: dst | ||
260 | * %rdx: src | ||
261 | */ | ||
262 | |||
263 | vzeroupper; | ||
264 | |||
265 | vmovdqu 0*32(%rdx), RXl0; | ||
266 | vmovdqu 1*32(%rdx), RXr0; | ||
267 | vmovdqu 2*32(%rdx), RXl1; | ||
268 | vmovdqu 3*32(%rdx), RXr1; | ||
269 | vmovdqu 4*32(%rdx), RXl2; | ||
270 | vmovdqu 5*32(%rdx), RXr2; | ||
271 | vmovdqu 6*32(%rdx), RXl3; | ||
272 | vmovdqu 7*32(%rdx), RXr3; | ||
273 | |||
274 | call __blowfish_enc_blk32; | ||
275 | |||
276 | vmovdqu RXr0, 0*32(%rsi); | ||
277 | vmovdqu RXl0, 1*32(%rsi); | ||
278 | vmovdqu RXr1, 2*32(%rsi); | ||
279 | vmovdqu RXl1, 3*32(%rsi); | ||
280 | vmovdqu RXr2, 4*32(%rsi); | ||
281 | vmovdqu RXl2, 5*32(%rsi); | ||
282 | vmovdqu RXr3, 6*32(%rsi); | ||
283 | vmovdqu RXl3, 7*32(%rsi); | ||
284 | |||
285 | vzeroupper; | ||
286 | |||
287 | ret; | ||
288 | ENDPROC(blowfish_ecb_enc_32way) | ||
289 | |||
290 | ENTRY(blowfish_ecb_dec_32way) | ||
291 | /* input: | ||
292 | * %rdi: ctx, CTX | ||
293 | * %rsi: dst | ||
294 | * %rdx: src | ||
295 | */ | ||
296 | |||
297 | vzeroupper; | ||
298 | |||
299 | vmovdqu 0*32(%rdx), RXl0; | ||
300 | vmovdqu 1*32(%rdx), RXr0; | ||
301 | vmovdqu 2*32(%rdx), RXl1; | ||
302 | vmovdqu 3*32(%rdx), RXr1; | ||
303 | vmovdqu 4*32(%rdx), RXl2; | ||
304 | vmovdqu 5*32(%rdx), RXr2; | ||
305 | vmovdqu 6*32(%rdx), RXl3; | ||
306 | vmovdqu 7*32(%rdx), RXr3; | ||
307 | |||
308 | call __blowfish_dec_blk32; | ||
309 | |||
310 | vmovdqu RXr0, 0*32(%rsi); | ||
311 | vmovdqu RXl0, 1*32(%rsi); | ||
312 | vmovdqu RXr1, 2*32(%rsi); | ||
313 | vmovdqu RXl1, 3*32(%rsi); | ||
314 | vmovdqu RXr2, 4*32(%rsi); | ||
315 | vmovdqu RXl2, 5*32(%rsi); | ||
316 | vmovdqu RXr3, 6*32(%rsi); | ||
317 | vmovdqu RXl3, 7*32(%rsi); | ||
318 | |||
319 | vzeroupper; | ||
320 | |||
321 | ret; | ||
322 | ENDPROC(blowfish_ecb_dec_32way) | ||
323 | |||
324 | ENTRY(blowfish_cbc_dec_32way) | ||
325 | /* input: | ||
326 | * %rdi: ctx, CTX | ||
327 | * %rsi: dst | ||
328 | * %rdx: src | ||
329 | */ | ||
330 | |||
331 | vzeroupper; | ||
332 | |||
333 | vmovdqu 0*32(%rdx), RXl0; | ||
334 | vmovdqu 1*32(%rdx), RXr0; | ||
335 | vmovdqu 2*32(%rdx), RXl1; | ||
336 | vmovdqu 3*32(%rdx), RXr1; | ||
337 | vmovdqu 4*32(%rdx), RXl2; | ||
338 | vmovdqu 5*32(%rdx), RXr2; | ||
339 | vmovdqu 6*32(%rdx), RXl3; | ||
340 | vmovdqu 7*32(%rdx), RXr3; | ||
341 | |||
342 | call __blowfish_dec_blk32; | ||
343 | |||
344 | /* xor with src */ | ||
345 | vmovq (%rdx), RT0x; | ||
346 | vpshufd $0x4f, RT0x, RT0x; | ||
347 | vinserti128 $1, 8(%rdx), RT0, RT0; | ||
348 | vpxor RT0, RXr0, RXr0; | ||
349 | vpxor 0*32+24(%rdx), RXl0, RXl0; | ||
350 | vpxor 1*32+24(%rdx), RXr1, RXr1; | ||
351 | vpxor 2*32+24(%rdx), RXl1, RXl1; | ||
352 | vpxor 3*32+24(%rdx), RXr2, RXr2; | ||
353 | vpxor 4*32+24(%rdx), RXl2, RXl2; | ||
354 | vpxor 5*32+24(%rdx), RXr3, RXr3; | ||
355 | vpxor 6*32+24(%rdx), RXl3, RXl3; | ||
356 | |||
357 | vmovdqu RXr0, (0*32)(%rsi); | ||
358 | vmovdqu RXl0, (1*32)(%rsi); | ||
359 | vmovdqu RXr1, (2*32)(%rsi); | ||
360 | vmovdqu RXl1, (3*32)(%rsi); | ||
361 | vmovdqu RXr2, (4*32)(%rsi); | ||
362 | vmovdqu RXl2, (5*32)(%rsi); | ||
363 | vmovdqu RXr3, (6*32)(%rsi); | ||
364 | vmovdqu RXl3, (7*32)(%rsi); | ||
365 | |||
366 | vzeroupper; | ||
367 | |||
368 | ret; | ||
369 | ENDPROC(blowfish_cbc_dec_32way) | ||
370 | |||
371 | ENTRY(blowfish_ctr_32way) | ||
372 | /* input: | ||
373 | * %rdi: ctx, CTX | ||
374 | * %rsi: dst | ||
375 | * %rdx: src | ||
376 | * %rcx: iv (big endian, 64bit) | ||
377 | */ | ||
378 | |||
379 | vzeroupper; | ||
380 | |||
381 | vpcmpeqd RT0, RT0, RT0; | ||
382 | vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ | ||
383 | |||
384 | vpcmpeqd RT1x, RT1x, RT1x; | ||
385 | vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */ | ||
386 | vpxor RIDX0, RIDX0, RIDX0; | ||
387 | vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */ | ||
388 | |||
389 | vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */ | ||
390 | |||
391 | vpcmpeqd RT1, RT1, RT1; | ||
392 | vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */ | ||
393 | vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */ | ||
394 | |||
395 | vbroadcasti128 .Lbswap_iv_mask, RIDX0; | ||
396 | vbroadcasti128 .Lbswap128_mask, RIDX1; | ||
397 | |||
398 | /* load IV and byteswap */ | ||
399 | vmovq (%rcx), RT1x; | ||
400 | vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */ | ||
401 | vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */ | ||
402 | |||
403 | /* construct IVs */ | ||
404 | vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */ | ||
405 | vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */ | ||
406 | vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */ | ||
407 | vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */ | ||
408 | vpsubq RIDX2, RT1, RT1; | ||
409 | vpshufb RIDX1, RT1, RXl1; | ||
410 | vpsubq RIDX2, RT1, RT1; | ||
411 | vpshufb RIDX1, RT1, RXr1; | ||
412 | vpsubq RIDX2, RT1, RT1; | ||
413 | vpshufb RIDX1, RT1, RXl2; | ||
414 | vpsubq RIDX2, RT1, RT1; | ||
415 | vpshufb RIDX1, RT1, RXr2; | ||
416 | vpsubq RIDX2, RT1, RT1; | ||
417 | vpshufb RIDX1, RT1, RXl3; | ||
418 | vpsubq RIDX2, RT1, RT1; | ||
419 | vpshufb RIDX1, RT1, RXr3; | ||
420 | |||
421 | /* store last IV */ | ||
422 | vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */ | ||
423 | vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */ | ||
424 | vmovq RT1x, (%rcx); | ||
425 | |||
426 | call __blowfish_enc_blk32; | ||
427 | |||
428 | /* dst = src ^ iv */ | ||
429 | vpxor 0*32(%rdx), RXr0, RXr0; | ||
430 | vpxor 1*32(%rdx), RXl0, RXl0; | ||
431 | vpxor 2*32(%rdx), RXr1, RXr1; | ||
432 | vpxor 3*32(%rdx), RXl1, RXl1; | ||
433 | vpxor 4*32(%rdx), RXr2, RXr2; | ||
434 | vpxor 5*32(%rdx), RXl2, RXl2; | ||
435 | vpxor 6*32(%rdx), RXr3, RXr3; | ||
436 | vpxor 7*32(%rdx), RXl3, RXl3; | ||
437 | vmovdqu RXr0, (0*32)(%rsi); | ||
438 | vmovdqu RXl0, (1*32)(%rsi); | ||
439 | vmovdqu RXr1, (2*32)(%rsi); | ||
440 | vmovdqu RXl1, (3*32)(%rsi); | ||
441 | vmovdqu RXr2, (4*32)(%rsi); | ||
442 | vmovdqu RXl2, (5*32)(%rsi); | ||
443 | vmovdqu RXr3, (6*32)(%rsi); | ||
444 | vmovdqu RXl3, (7*32)(%rsi); | ||
445 | |||
446 | vzeroupper; | ||
447 | |||
448 | ret; | ||
449 | ENDPROC(blowfish_ctr_32way) | ||
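This new file vectorises the classic Blowfish round function: the F() macro near the top performs the four S-box lookups with vpgatherdd for eight 32-bit words per ymm register, then XORs the result into the other half of the block. A hedged scalar reference for the same round function (not part of this patch; the context layout is simplified to just the S-boxes).

```c
#include <stdint.h>

/* Simplified context: four 256-entry S-boxes, matching the s0..s3 offsets above. */
struct bf_sboxes {
	uint32_t s[4][256];
};

static uint32_t blowfish_f(const struct bf_sboxes *ctx, uint32_t x)
{
	uint32_t a = x >> 24;		/* matches vpsrld $24 */
	uint32_t b = (x >> 16) & 0xff;	/* matches vpsrld $16 + byte mask */
	uint32_t c = (x >> 8) & 0xff;	/* matches vpsrld $8 + byte mask */
	uint32_t d = x & 0xff;		/* matches vpand RBYTE */

	return ((ctx->s[0][a] + ctx->s[1][b]) ^ ctx->s[2][c]) + ctx->s[3][d];
}
```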
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c
new file mode 100644
index 000000000000..4417e9aea78d
--- /dev/null
+++ b/arch/x86/crypto/blowfish_avx2_glue.c
@@ -0,0 +1,585 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | ||
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
8 | * CTR part based on code (crypto/ctr.c) by: | ||
9 | * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/crypto.h> | ||
26 | #include <linux/err.h> | ||
27 | #include <crypto/algapi.h> | ||
28 | #include <crypto/blowfish.h> | ||
29 | #include <crypto/cryptd.h> | ||
30 | #include <crypto/ctr.h> | ||
31 | #include <asm/i387.h> | ||
32 | #include <asm/xcr.h> | ||
33 | #include <asm/xsave.h> | ||
34 | #include <asm/crypto/blowfish.h> | ||
35 | #include <asm/crypto/ablk_helper.h> | ||
36 | #include <crypto/scatterwalk.h> | ||
37 | |||
38 | #define BF_AVX2_PARALLEL_BLOCKS 32 | ||
39 | |||
40 | /* 32-way AVX2 parallel cipher functions */ | ||
41 | asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst, | ||
42 | const u8 *src); | ||
43 | asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst, | ||
44 | const u8 *src); | ||
45 | asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst, | ||
46 | const u8 *src); | ||
47 | asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src, | ||
48 | __be64 *iv); | ||
49 | |||
50 | static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
51 | { | ||
52 | if (fpu_enabled) | ||
53 | return true; | ||
54 | |||
55 | /* FPU is only used when chunk to be processed is large enough, so | ||
56 | * do not enable FPU until it is necessary. | ||
57 | */ | ||
58 | if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS) | ||
59 | return false; | ||
60 | |||
61 | kernel_fpu_begin(); | ||
62 | return true; | ||
63 | } | ||
64 | |||
65 | static inline void bf_fpu_end(bool fpu_enabled) | ||
66 | { | ||
67 | if (fpu_enabled) | ||
68 | kernel_fpu_end(); | ||
69 | } | ||
70 | |||
71 | static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | ||
72 | bool enc) | ||
73 | { | ||
74 | bool fpu_enabled = false; | ||
75 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
76 | const unsigned int bsize = BF_BLOCK_SIZE; | ||
77 | unsigned int nbytes; | ||
78 | int err; | ||
79 | |||
80 | err = blkcipher_walk_virt(desc, walk); | ||
81 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
82 | |||
83 | while ((nbytes = walk->nbytes)) { | ||
84 | u8 *wsrc = walk->src.virt.addr; | ||
85 | u8 *wdst = walk->dst.virt.addr; | ||
86 | |||
87 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
88 | |||
89 | /* Process multi-block AVX2 batch */ | ||
90 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
91 | do { | ||
92 | if (enc) | ||
93 | blowfish_ecb_enc_32way(ctx, wdst, wsrc); | ||
94 | else | ||
95 | blowfish_ecb_dec_32way(ctx, wdst, wsrc); | ||
96 | |||
97 | wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
98 | wdst += bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
99 | nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
100 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
101 | |||
102 | if (nbytes < bsize) | ||
103 | goto done; | ||
104 | } | ||
105 | |||
106 | /* Process multi-block batch */ | ||
107 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
108 | do { | ||
109 | if (enc) | ||
110 | blowfish_enc_blk_4way(ctx, wdst, wsrc); | ||
111 | else | ||
112 | blowfish_dec_blk_4way(ctx, wdst, wsrc); | ||
113 | |||
114 | wsrc += bsize * BF_PARALLEL_BLOCKS; | ||
115 | wdst += bsize * BF_PARALLEL_BLOCKS; | ||
116 | nbytes -= bsize * BF_PARALLEL_BLOCKS; | ||
117 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
118 | |||
119 | if (nbytes < bsize) | ||
120 | goto done; | ||
121 | } | ||
122 | |||
123 | /* Handle leftovers */ | ||
124 | do { | ||
125 | if (enc) | ||
126 | blowfish_enc_blk(ctx, wdst, wsrc); | ||
127 | else | ||
128 | blowfish_dec_blk(ctx, wdst, wsrc); | ||
129 | |||
130 | wsrc += bsize; | ||
131 | wdst += bsize; | ||
132 | nbytes -= bsize; | ||
133 | } while (nbytes >= bsize); | ||
134 | |||
135 | done: | ||
136 | err = blkcipher_walk_done(desc, walk, nbytes); | ||
137 | } | ||
138 | |||
139 | bf_fpu_end(fpu_enabled); | ||
140 | return err; | ||
141 | } | ||
142 | |||
143 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
144 | struct scatterlist *src, unsigned int nbytes) | ||
145 | { | ||
146 | struct blkcipher_walk walk; | ||
147 | |||
148 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
149 | return ecb_crypt(desc, &walk, true); | ||
150 | } | ||
151 | |||
152 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
153 | struct scatterlist *src, unsigned int nbytes) | ||
154 | { | ||
155 | struct blkcipher_walk walk; | ||
156 | |||
157 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
158 | return ecb_crypt(desc, &walk, false); | ||
159 | } | ||
160 | |||
161 | static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, | ||
162 | struct blkcipher_walk *walk) | ||
163 | { | ||
164 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
165 | unsigned int bsize = BF_BLOCK_SIZE; | ||
166 | unsigned int nbytes = walk->nbytes; | ||
167 | u64 *src = (u64 *)walk->src.virt.addr; | ||
168 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
169 | u64 *iv = (u64 *)walk->iv; | ||
170 | |||
171 | do { | ||
172 | *dst = *src ^ *iv; | ||
173 | blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); | ||
174 | iv = dst; | ||
175 | |||
176 | src += 1; | ||
177 | dst += 1; | ||
178 | nbytes -= bsize; | ||
179 | } while (nbytes >= bsize); | ||
180 | |||
181 | *(u64 *)walk->iv = *iv; | ||
182 | return nbytes; | ||
183 | } | ||
184 | |||
185 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
186 | struct scatterlist *src, unsigned int nbytes) | ||
187 | { | ||
188 | struct blkcipher_walk walk; | ||
189 | int err; | ||
190 | |||
191 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
192 | err = blkcipher_walk_virt(desc, &walk); | ||
193 | |||
194 | while ((nbytes = walk.nbytes)) { | ||
195 | nbytes = __cbc_encrypt(desc, &walk); | ||
196 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
197 | } | ||
198 | |||
199 | return err; | ||
200 | } | ||
201 | |||
202 | static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, | ||
203 | struct blkcipher_walk *walk) | ||
204 | { | ||
205 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
206 | const unsigned int bsize = BF_BLOCK_SIZE; | ||
207 | unsigned int nbytes = walk->nbytes; | ||
208 | u64 *src = (u64 *)walk->src.virt.addr; | ||
209 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
210 | u64 last_iv; | ||
211 | int i; | ||
212 | |||
213 | /* Start of the last block. */ | ||
214 | src += nbytes / bsize - 1; | ||
215 | dst += nbytes / bsize - 1; | ||
216 | |||
217 | last_iv = *src; | ||
218 | |||
219 | /* Process multi-block AVX2 batch */ | ||
220 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
221 | do { | ||
222 | nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1); | ||
223 | src -= BF_AVX2_PARALLEL_BLOCKS - 1; | ||
224 | dst -= BF_AVX2_PARALLEL_BLOCKS - 1; | ||
225 | |||
226 | blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src); | ||
227 | |||
228 | nbytes -= bsize; | ||
229 | if (nbytes < bsize) | ||
230 | goto done; | ||
231 | |||
232 | *dst ^= *(src - 1); | ||
233 | src -= 1; | ||
234 | dst -= 1; | ||
235 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
236 | |||
237 | if (nbytes < bsize) | ||
238 | goto done; | ||
239 | } | ||
240 | |||
241 | /* Process multi-block batch */ | ||
242 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
243 | u64 ivs[BF_PARALLEL_BLOCKS - 1]; | ||
244 | |||
245 | do { | ||
246 | nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1); | ||
247 | src -= BF_PARALLEL_BLOCKS - 1; | ||
248 | dst -= BF_PARALLEL_BLOCKS - 1; | ||
249 | |||
250 | for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) | ||
251 | ivs[i] = src[i]; | ||
252 | |||
253 | blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); | ||
254 | |||
255 | for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) | ||
256 | dst[i + 1] ^= ivs[i]; | ||
257 | |||
258 | nbytes -= bsize; | ||
259 | if (nbytes < bsize) | ||
260 | goto done; | ||
261 | |||
262 | *dst ^= *(src - 1); | ||
263 | src -= 1; | ||
264 | dst -= 1; | ||
265 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
266 | |||
267 | if (nbytes < bsize) | ||
268 | goto done; | ||
269 | } | ||
270 | |||
271 | /* Handle leftovers */ | ||
272 | for (;;) { | ||
273 | blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); | ||
274 | |||
275 | nbytes -= bsize; | ||
276 | if (nbytes < bsize) | ||
277 | break; | ||
278 | |||
279 | *dst ^= *(src - 1); | ||
280 | src -= 1; | ||
281 | dst -= 1; | ||
282 | } | ||
283 | |||
284 | done: | ||
285 | *dst ^= *(u64 *)walk->iv; | ||
286 | *(u64 *)walk->iv = last_iv; | ||
287 | |||
288 | return nbytes; | ||
289 | } | ||
290 | |||
291 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
292 | struct scatterlist *src, unsigned int nbytes) | ||
293 | { | ||
294 | bool fpu_enabled = false; | ||
295 | struct blkcipher_walk walk; | ||
296 | int err; | ||
297 | |||
298 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
299 | err = blkcipher_walk_virt(desc, &walk); | ||
300 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
301 | |||
302 | while ((nbytes = walk.nbytes)) { | ||
303 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
304 | nbytes = __cbc_decrypt(desc, &walk); | ||
305 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
306 | } | ||
307 | |||
308 | bf_fpu_end(fpu_enabled); | ||
309 | return err; | ||
310 | } | ||
311 | |||
312 | static void ctr_crypt_final(struct blkcipher_desc *desc, | ||
313 | struct blkcipher_walk *walk) | ||
314 | { | ||
315 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
316 | u8 *ctrblk = walk->iv; | ||
317 | u8 keystream[BF_BLOCK_SIZE]; | ||
318 | u8 *src = walk->src.virt.addr; | ||
319 | u8 *dst = walk->dst.virt.addr; | ||
320 | unsigned int nbytes = walk->nbytes; | ||
321 | |||
322 | blowfish_enc_blk(ctx, keystream, ctrblk); | ||
323 | crypto_xor(keystream, src, nbytes); | ||
324 | memcpy(dst, keystream, nbytes); | ||
325 | |||
326 | crypto_inc(ctrblk, BF_BLOCK_SIZE); | ||
327 | } | ||
328 | |||
329 | static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | ||
330 | struct blkcipher_walk *walk) | ||
331 | { | ||
332 | struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
333 | unsigned int bsize = BF_BLOCK_SIZE; | ||
334 | unsigned int nbytes = walk->nbytes; | ||
335 | u64 *src = (u64 *)walk->src.virt.addr; | ||
336 | u64 *dst = (u64 *)walk->dst.virt.addr; | ||
337 | int i; | ||
338 | |||
339 | /* Process multi-block AVX2 batch */ | ||
340 | if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { | ||
341 | do { | ||
342 | blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src, | ||
343 | (__be64 *)walk->iv); | ||
344 | |||
345 | src += BF_AVX2_PARALLEL_BLOCKS; | ||
346 | dst += BF_AVX2_PARALLEL_BLOCKS; | ||
347 | nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; | ||
348 | } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); | ||
349 | |||
350 | if (nbytes < bsize) | ||
351 | goto done; | ||
352 | } | ||
353 | |||
354 | /* Process four block batch */ | ||
355 | if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { | ||
356 | __be64 ctrblocks[BF_PARALLEL_BLOCKS]; | ||
357 | u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); | ||
358 | |||
359 | do { | ||
360 | /* create ctrblks for parallel encrypt */ | ||
361 | for (i = 0; i < BF_PARALLEL_BLOCKS; i++) { | ||
362 | if (dst != src) | ||
363 | dst[i] = src[i]; | ||
364 | |||
365 | ctrblocks[i] = cpu_to_be64(ctrblk++); | ||
366 | } | ||
367 | |||
368 | blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, | ||
369 | (u8 *)ctrblocks); | ||
370 | |||
371 | src += BF_PARALLEL_BLOCKS; | ||
372 | dst += BF_PARALLEL_BLOCKS; | ||
373 | nbytes -= bsize * BF_PARALLEL_BLOCKS; | ||
374 | } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); | ||
375 | |||
376 | *(__be64 *)walk->iv = cpu_to_be64(ctrblk); | ||
377 | |||
378 | if (nbytes < bsize) | ||
379 | goto done; | ||
380 | } | ||
381 | |||
382 | /* Handle leftovers */ | ||
383 | do { | ||
384 | u64 ctrblk; | ||
385 | |||
386 | if (dst != src) | ||
387 | *dst = *src; | ||
388 | |||
389 | ctrblk = *(u64 *)walk->iv; | ||
390 | be64_add_cpu((__be64 *)walk->iv, 1); | ||
391 | |||
392 | blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); | ||
393 | |||
394 | src += 1; | ||
395 | dst += 1; | ||
396 | } while ((nbytes -= bsize) >= bsize); | ||
397 | |||
398 | done: | ||
399 | return nbytes; | ||
400 | } | ||
401 | |||
402 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
403 | struct scatterlist *src, unsigned int nbytes) | ||
404 | { | ||
405 | bool fpu_enabled = false; | ||
406 | struct blkcipher_walk walk; | ||
407 | int err; | ||
408 | |||
409 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
410 | err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); | ||
411 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
412 | |||
413 | while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { | ||
414 | fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); | ||
415 | nbytes = __ctr_crypt(desc, &walk); | ||
416 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
417 | } | ||
418 | |||
419 | bf_fpu_end(fpu_enabled); | ||
420 | |||
421 | if (walk.nbytes) { | ||
422 | ctr_crypt_final(desc, &walk); | ||
423 | err = blkcipher_walk_done(desc, &walk, 0); | ||
424 | } | ||
425 | |||
426 | return err; | ||
427 | } | ||
428 | |||
429 | static struct crypto_alg bf_algs[6] = { { | ||
430 | .cra_name = "__ecb-blowfish-avx2", | ||
431 | .cra_driver_name = "__driver-ecb-blowfish-avx2", | ||
432 | .cra_priority = 0, | ||
433 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
434 | .cra_blocksize = BF_BLOCK_SIZE, | ||
435 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
436 | .cra_alignmask = 0, | ||
437 | .cra_type = &crypto_blkcipher_type, | ||
438 | .cra_module = THIS_MODULE, | ||
439 | .cra_u = { | ||
440 | .blkcipher = { | ||
441 | .min_keysize = BF_MIN_KEY_SIZE, | ||
442 | .max_keysize = BF_MAX_KEY_SIZE, | ||
443 | .setkey = blowfish_setkey, | ||
444 | .encrypt = ecb_encrypt, | ||
445 | .decrypt = ecb_decrypt, | ||
446 | }, | ||
447 | }, | ||
448 | }, { | ||
449 | .cra_name = "__cbc-blowfish-avx2", | ||
450 | .cra_driver_name = "__driver-cbc-blowfish-avx2", | ||
451 | .cra_priority = 0, | ||
452 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
453 | .cra_blocksize = BF_BLOCK_SIZE, | ||
454 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
455 | .cra_alignmask = 0, | ||
456 | .cra_type = &crypto_blkcipher_type, | ||
457 | .cra_module = THIS_MODULE, | ||
458 | .cra_u = { | ||
459 | .blkcipher = { | ||
460 | .min_keysize = BF_MIN_KEY_SIZE, | ||
461 | .max_keysize = BF_MAX_KEY_SIZE, | ||
462 | .setkey = blowfish_setkey, | ||
463 | .encrypt = cbc_encrypt, | ||
464 | .decrypt = cbc_decrypt, | ||
465 | }, | ||
466 | }, | ||
467 | }, { | ||
468 | .cra_name = "__ctr-blowfish-avx2", | ||
469 | .cra_driver_name = "__driver-ctr-blowfish-avx2", | ||
470 | .cra_priority = 0, | ||
471 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
472 | .cra_blocksize = 1, | ||
473 | .cra_ctxsize = sizeof(struct bf_ctx), | ||
474 | .cra_alignmask = 0, | ||
475 | .cra_type = &crypto_blkcipher_type, | ||
476 | .cra_module = THIS_MODULE, | ||
477 | .cra_u = { | ||
478 | .blkcipher = { | ||
479 | .min_keysize = BF_MIN_KEY_SIZE, | ||
480 | .max_keysize = BF_MAX_KEY_SIZE, | ||
481 | .ivsize = BF_BLOCK_SIZE, | ||
482 | .setkey = blowfish_setkey, | ||
483 | .encrypt = ctr_crypt, | ||
484 | .decrypt = ctr_crypt, | ||
485 | }, | ||
486 | }, | ||
487 | }, { | ||
488 | .cra_name = "ecb(blowfish)", | ||
489 | .cra_driver_name = "ecb-blowfish-avx2", | ||
490 | .cra_priority = 400, | ||
491 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
492 | .cra_blocksize = BF_BLOCK_SIZE, | ||
493 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
494 | .cra_alignmask = 0, | ||
495 | .cra_type = &crypto_ablkcipher_type, | ||
496 | .cra_module = THIS_MODULE, | ||
497 | .cra_init = ablk_init, | ||
498 | .cra_exit = ablk_exit, | ||
499 | .cra_u = { | ||
500 | .ablkcipher = { | ||
501 | .min_keysize = BF_MIN_KEY_SIZE, | ||
502 | .max_keysize = BF_MAX_KEY_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_decrypt, | ||
506 | }, | ||
507 | }, | ||
508 | }, { | ||
509 | .cra_name = "cbc(blowfish)", | ||
510 | .cra_driver_name = "cbc-blowfish-avx2", | ||
511 | .cra_priority = 400, | ||
512 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
513 | .cra_blocksize = BF_BLOCK_SIZE, | ||
514 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
515 | .cra_alignmask = 0, | ||
516 | .cra_type = &crypto_ablkcipher_type, | ||
517 | .cra_module = THIS_MODULE, | ||
518 | .cra_init = ablk_init, | ||
519 | .cra_exit = ablk_exit, | ||
520 | .cra_u = { | ||
521 | .ablkcipher = { | ||
522 | .min_keysize = BF_MIN_KEY_SIZE, | ||
523 | .max_keysize = BF_MAX_KEY_SIZE, | ||
524 | .ivsize = BF_BLOCK_SIZE, | ||
525 | .setkey = ablk_set_key, | ||
526 | .encrypt = __ablk_encrypt, | ||
527 | .decrypt = ablk_decrypt, | ||
528 | }, | ||
529 | }, | ||
530 | }, { | ||
531 | .cra_name = "ctr(blowfish)", | ||
532 | .cra_driver_name = "ctr-blowfish-avx2", | ||
533 | .cra_priority = 400, | ||
534 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
535 | .cra_blocksize = 1, | ||
536 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
537 | .cra_alignmask = 0, | ||
538 | .cra_type = &crypto_ablkcipher_type, | ||
539 | .cra_module = THIS_MODULE, | ||
540 | .cra_init = ablk_init, | ||
541 | .cra_exit = ablk_exit, | ||
542 | .cra_u = { | ||
543 | .ablkcipher = { | ||
544 | .min_keysize = BF_MIN_KEY_SIZE, | ||
545 | .max_keysize = BF_MAX_KEY_SIZE, | ||
546 | .ivsize = BF_BLOCK_SIZE, | ||
547 | .setkey = ablk_set_key, | ||
548 | .encrypt = ablk_encrypt, | ||
549 | .decrypt = ablk_encrypt, | ||
550 | .geniv = "chainiv", | ||
551 | }, | ||
552 | }, | ||
553 | } }; | ||
554 | |||
555 | |||
556 | static int __init init(void) | ||
557 | { | ||
558 | u64 xcr0; | ||
559 | |||
560 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
561 | pr_info("AVX2 instructions are not detected.\n"); | ||
562 | return -ENODEV; | ||
563 | } | ||
564 | |||
565 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
566 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
567 | pr_info("AVX detected but unusable.\n"); | ||
568 | return -ENODEV; | ||
569 | } | ||
570 | |||
571 | return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); | ||
572 | } | ||
573 | |||
574 | static void __exit fini(void) | ||
575 | { | ||
576 | crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); | ||
577 | } | ||
578 | |||
579 | module_init(init); | ||
580 | module_exit(fini); | ||
581 | |||
582 | MODULE_LICENSE("GPL"); | ||
583 | MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized"); | ||
584 | MODULE_ALIAS("blowfish"); | ||
585 | MODULE_ALIAS("blowfish-asm"); | ||
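The registration above follows the usual pattern for FPU-using x86 cipher glue: the "__ecb/__cbc/__ctr-blowfish-avx2" entries are internal synchronous implementations at priority 0, and the "ecb(blowfish)"/"cbc(blowfish)"/"ctr(blowfish)" entries wrap them asynchronously through cryptd via the ablk_* helpers at priority 400. A hedged usage sketch (not part of this patch): a caller simply asks for the generic name and the highest-priority provider is selected.

```c
#include <linux/crypto.h>
#include <linux/err.h>

/* Hypothetical caller; on AVX2-capable CPUs this resolves to ctr-blowfish-avx2. */
static struct crypto_ablkcipher *get_blowfish_ctr(void)
{
	return crypto_alloc_ablkcipher("ctr(blowfish)", 0, 0);
}
```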
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333b70e6..3548d76dbaa9 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Glue Code for assembler optimized version of Blowfish | 2 | * Glue Code for assembler optimized version of Blowfish |
3 | * | 3 | * |
4 | * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | 6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: |
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | 7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> |
@@ -32,40 +32,24 @@ | |||
32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
33 | #include <linux/types.h> | 33 | #include <linux/types.h> |
34 | #include <crypto/algapi.h> | 34 | #include <crypto/algapi.h> |
35 | #include <asm/crypto/blowfish.h> | ||
35 | 36 | ||
36 | /* regular block cipher functions */ | 37 | /* regular block cipher functions */ |
37 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, | 38 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, |
38 | bool xor); | 39 | bool xor); |
40 | EXPORT_SYMBOL_GPL(__blowfish_enc_blk); | ||
41 | |||
39 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); | 42 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); |
43 | EXPORT_SYMBOL_GPL(blowfish_dec_blk); | ||
40 | 44 | ||
41 | /* 4-way parallel cipher functions */ | 45 | /* 4-way parallel cipher functions */ |
42 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | 46 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, |
43 | const u8 *src, bool xor); | 47 | const u8 *src, bool xor); |
48 | EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way); | ||
49 | |||
44 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, | 50 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, |
45 | const u8 *src); | 51 | const u8 *src); |
46 | 52 | EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way); | |
47 | static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) | ||
48 | { | ||
49 | __blowfish_enc_blk(ctx, dst, src, false); | ||
50 | } | ||
51 | |||
52 | static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, | ||
53 | const u8 *src) | ||
54 | { | ||
55 | __blowfish_enc_blk(ctx, dst, src, true); | ||
56 | } | ||
57 | |||
58 | static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
59 | const u8 *src) | ||
60 | { | ||
61 | __blowfish_enc_blk_4way(ctx, dst, src, false); | ||
62 | } | ||
63 | |||
64 | static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, | ||
65 | const u8 *src) | ||
66 | { | ||
67 | __blowfish_enc_blk_4way(ctx, dst, src, true); | ||
68 | } | ||
69 | 53 | ||
70 | static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | 54 | static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) |
71 | { | 55 | { |
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index cfc163469c71..ce71f9212409 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * x86_64/AVX/AES-NI assembler implementation of Camellia | 2 | * x86_64/AVX/AES-NI assembler implementation of Camellia |
3 | * | 3 | * |
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | |||
589 | .Lbswap128_mask: | 589 | .Lbswap128_mask: |
590 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 590 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
591 | 591 | ||
592 | /* For XTS mode IV generation */ | ||
593 | .Lxts_gf128mul_and_shl1_mask: | ||
594 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
595 | |||
592 | /* | 596 | /* |
593 | * pre-SubByte transform | 597 | * pre-SubByte transform |
594 | * | 598 | * |
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way) | |||
1090 | 1094 | ||
1091 | ret; | 1095 | ret; |
1092 | ENDPROC(camellia_ctr_16way) | 1096 | ENDPROC(camellia_ctr_16way) |
1097 | |||
1098 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
1099 | vpsrad $31, iv, tmp; \ | ||
1100 | vpaddq iv, iv, iv; \ | ||
1101 | vpshufd $0x13, tmp, tmp; \ | ||
1102 | vpand mask, tmp, tmp; \ | ||
1103 | vpxor tmp, iv, iv; | ||
1104 | |||
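The gf128mul_x_ble macro above advances the XTS tweak by one block position: it multiplies the 128-bit little-endian tweak by α in GF(2¹²⁸). The vpsrad/vpshufd/vpand sequence builds, from the top bits of both qwords, a correction word that covers both the carry from the low qword into the high qword and the reduction constant 0x87 for the bit shifted out of bit 127. A scalar C sketch of the same doubling (for readability only; it mirrors the kernel's generic le128 helper rather than code from this patch):

#include <stdint.h>

/* Double a 128-bit XTS tweak (little-endian limb order) in GF(2^128),
 * reducing with x^128 + x^7 + x^2 + x + 1, i.e. the constant 0x87. */
static void gf128mul_x_ble_ref(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;              /* bit shifted out of the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);        /* 128-bit shift left by one */
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);  /* fold the carry back in */
}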
1105 | .align 8 | ||
1106 | camellia_xts_crypt_16way: | ||
1107 | /* input: | ||
1108 | * %rdi: ctx, CTX | ||
1109 | * %rsi: dst (16 blocks) | ||
1110 | * %rdx: src (16 blocks) | ||
1111 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1112 | * %r8: index for input whitening key | ||
1113 | * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 | ||
1114 | */ | ||
1115 | |||
1116 | subq $(16 * 16), %rsp; | ||
1117 | movq %rsp, %rax; | ||
1118 | |||
1119 | vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; | ||
1120 | |||
1121 | /* load IV */ | ||
1122 | vmovdqu (%rcx), %xmm0; | ||
1123 | vpxor 0 * 16(%rdx), %xmm0, %xmm15; | ||
1124 | vmovdqu %xmm15, 15 * 16(%rax); | ||
1125 | vmovdqu %xmm0, 0 * 16(%rsi); | ||
1126 | |||
1127 | /* construct IVs */ | ||
1128 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1129 | vpxor 1 * 16(%rdx), %xmm0, %xmm15; | ||
1130 | vmovdqu %xmm15, 14 * 16(%rax); | ||
1131 | vmovdqu %xmm0, 1 * 16(%rsi); | ||
1132 | |||
1133 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1134 | vpxor 2 * 16(%rdx), %xmm0, %xmm13; | ||
1135 | vmovdqu %xmm0, 2 * 16(%rsi); | ||
1136 | |||
1137 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1138 | vpxor 3 * 16(%rdx), %xmm0, %xmm12; | ||
1139 | vmovdqu %xmm0, 3 * 16(%rsi); | ||
1140 | |||
1141 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1142 | vpxor 4 * 16(%rdx), %xmm0, %xmm11; | ||
1143 | vmovdqu %xmm0, 4 * 16(%rsi); | ||
1144 | |||
1145 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1146 | vpxor 5 * 16(%rdx), %xmm0, %xmm10; | ||
1147 | vmovdqu %xmm0, 5 * 16(%rsi); | ||
1148 | |||
1149 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1150 | vpxor 6 * 16(%rdx), %xmm0, %xmm9; | ||
1151 | vmovdqu %xmm0, 6 * 16(%rsi); | ||
1152 | |||
1153 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1154 | vpxor 7 * 16(%rdx), %xmm0, %xmm8; | ||
1155 | vmovdqu %xmm0, 7 * 16(%rsi); | ||
1156 | |||
1157 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1158 | vpxor 8 * 16(%rdx), %xmm0, %xmm7; | ||
1159 | vmovdqu %xmm0, 8 * 16(%rsi); | ||
1160 | |||
1161 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1162 | vpxor 9 * 16(%rdx), %xmm0, %xmm6; | ||
1163 | vmovdqu %xmm0, 9 * 16(%rsi); | ||
1164 | |||
1165 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1166 | vpxor 10 * 16(%rdx), %xmm0, %xmm5; | ||
1167 | vmovdqu %xmm0, 10 * 16(%rsi); | ||
1168 | |||
1169 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1170 | vpxor 11 * 16(%rdx), %xmm0, %xmm4; | ||
1171 | vmovdqu %xmm0, 11 * 16(%rsi); | ||
1172 | |||
1173 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1174 | vpxor 12 * 16(%rdx), %xmm0, %xmm3; | ||
1175 | vmovdqu %xmm0, 12 * 16(%rsi); | ||
1176 | |||
1177 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1178 | vpxor 13 * 16(%rdx), %xmm0, %xmm2; | ||
1179 | vmovdqu %xmm0, 13 * 16(%rsi); | ||
1180 | |||
1181 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1182 | vpxor 14 * 16(%rdx), %xmm0, %xmm1; | ||
1183 | vmovdqu %xmm0, 14 * 16(%rsi); | ||
1184 | |||
1185 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1186 | vpxor 15 * 16(%rdx), %xmm0, %xmm15; | ||
1187 | vmovdqu %xmm15, 0 * 16(%rax); | ||
1188 | vmovdqu %xmm0, 15 * 16(%rsi); | ||
1189 | |||
1190 | gf128mul_x_ble(%xmm0, %xmm14, %xmm15); | ||
1191 | vmovdqu %xmm0, (%rcx); | ||
1192 | |||
1193 | /* inpack16_pre: */ | ||
1194 | vmovq (key_table)(CTX, %r8, 8), %xmm15; | ||
1195 | vpshufb .Lpack_bswap, %xmm15, %xmm15; | ||
1196 | vpxor 0 * 16(%rax), %xmm15, %xmm0; | ||
1197 | vpxor %xmm1, %xmm15, %xmm1; | ||
1198 | vpxor %xmm2, %xmm15, %xmm2; | ||
1199 | vpxor %xmm3, %xmm15, %xmm3; | ||
1200 | vpxor %xmm4, %xmm15, %xmm4; | ||
1201 | vpxor %xmm5, %xmm15, %xmm5; | ||
1202 | vpxor %xmm6, %xmm15, %xmm6; | ||
1203 | vpxor %xmm7, %xmm15, %xmm7; | ||
1204 | vpxor %xmm8, %xmm15, %xmm8; | ||
1205 | vpxor %xmm9, %xmm15, %xmm9; | ||
1206 | vpxor %xmm10, %xmm15, %xmm10; | ||
1207 | vpxor %xmm11, %xmm15, %xmm11; | ||
1208 | vpxor %xmm12, %xmm15, %xmm12; | ||
1209 | vpxor %xmm13, %xmm15, %xmm13; | ||
1210 | vpxor 14 * 16(%rax), %xmm15, %xmm14; | ||
1211 | vpxor 15 * 16(%rax), %xmm15, %xmm15; | ||
1212 | |||
1213 | call *%r9; | ||
1214 | |||
1215 | addq $(16 * 16), %rsp; | ||
1216 | |||
1217 | vpxor 0 * 16(%rsi), %xmm7, %xmm7; | ||
1218 | vpxor 1 * 16(%rsi), %xmm6, %xmm6; | ||
1219 | vpxor 2 * 16(%rsi), %xmm5, %xmm5; | ||
1220 | vpxor 3 * 16(%rsi), %xmm4, %xmm4; | ||
1221 | vpxor 4 * 16(%rsi), %xmm3, %xmm3; | ||
1222 | vpxor 5 * 16(%rsi), %xmm2, %xmm2; | ||
1223 | vpxor 6 * 16(%rsi), %xmm1, %xmm1; | ||
1224 | vpxor 7 * 16(%rsi), %xmm0, %xmm0; | ||
1225 | vpxor 8 * 16(%rsi), %xmm15, %xmm15; | ||
1226 | vpxor 9 * 16(%rsi), %xmm14, %xmm14; | ||
1227 | vpxor 10 * 16(%rsi), %xmm13, %xmm13; | ||
1228 | vpxor 11 * 16(%rsi), %xmm12, %xmm12; | ||
1229 | vpxor 12 * 16(%rsi), %xmm11, %xmm11; | ||
1230 | vpxor 13 * 16(%rsi), %xmm10, %xmm10; | ||
1231 | vpxor 14 * 16(%rsi), %xmm9, %xmm9; | ||
1232 | vpxor 15 * 16(%rsi), %xmm8, %xmm8; | ||
1233 | write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, | ||
1234 | %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, | ||
1235 | %xmm8, %rsi); | ||
1236 | |||
1237 | ret; | ||
1238 | ENDPROC(camellia_xts_crypt_16way) | ||
1239 | |||
1240 | ENTRY(camellia_xts_enc_16way) | ||
1241 | /* input: | ||
1242 | * %rdi: ctx, CTX | ||
1243 | * %rsi: dst (16 blocks) | ||
1244 | * %rdx: src (16 blocks) | ||
1245 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1246 | */ | ||
1247 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | ||
1248 | |||
1249 | leaq __camellia_enc_blk16, %r9; | ||
1250 | |||
1251 | jmp camellia_xts_crypt_16way; | ||
1252 | ENDPROC(camellia_xts_enc_16way) | ||
1253 | |||
1254 | ENTRY(camellia_xts_dec_16way) | ||
1255 | /* input: | ||
1256 | * %rdi: ctx, CTX | ||
1257 | * %rsi: dst (16 blocks) | ||
1258 | * %rdx: src (16 blocks) | ||
1259 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1260 | */ | ||
1261 | |||
1262 | cmpl $16, key_length(CTX); | ||
1263 | movl $32, %r8d; | ||
1264 | movl $24, %eax; | ||
1265 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | ||
1266 | |||
1267 | leaq __camellia_dec_blk16, %r9; | ||
1268 | |||
1269 | jmp camellia_xts_crypt_16way; | ||
1270 | ENDPROC(camellia_xts_dec_16way) | ||
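Both entry points funnel into camellia_xts_crypt_16way, which is plain XTS: every block is masked with its tweak, run through the raw 16-way Camellia primitive selected via %r9, and masked with the same tweak again, while the tweak itself advances by one GF(2¹²⁸) doubling per block and the final value is written back through %rcx for the next call. A per-block C reference of that data flow (a sketch; encrypt_block stands in for a hypothetical single-block cipher call, not a helper that exists in this file):

#include <stdint.h>
#include <stddef.h>

typedef void (*block_fn)(uint8_t out[16], const uint8_t in[16]);

/* One XTS step: C = E_K1(P ^ T) ^ T; the caller then doubles T in GF(2^128). */
static void xts_one_block(uint8_t out[16], const uint8_t in[16],
			  const uint8_t tweak[16], block_fn encrypt_block)
{
	uint8_t buf[16];
	size_t i;

	for (i = 0; i < 16; i++)            /* pre-whiten with the tweak */
		buf[i] = in[i] ^ tweak[i];

	encrypt_block(buf, buf);            /* raw single-block cipher */

	for (i = 0; i < 16; i++)            /* post-whiten with the same tweak */
		out[i] = buf[i] ^ tweak[i];
}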
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S new file mode 100644 index 000000000000..91a1878fcc3e --- /dev/null +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S | |||
@@ -0,0 +1,1368 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2/AES-NI assembler implementation of Camellia | ||
3 | * | ||
4 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | |||
15 | #define CAMELLIA_TABLE_BYTE_LEN 272 | ||
16 | |||
17 | /* struct camellia_ctx: */ | ||
18 | #define key_table 0 | ||
19 | #define key_length CAMELLIA_TABLE_BYTE_LEN | ||
20 | |||
21 | /* register macros */ | ||
22 | #define CTX %rdi | ||
23 | #define RIO %r8 | ||
24 | |||
25 | /********************************************************************** | ||
26 | helper macros | ||
27 | **********************************************************************/ | ||
28 | #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ | ||
29 | vpand x, mask4bit, tmp0; \ | ||
30 | vpandn x, mask4bit, x; \ | ||
31 | vpsrld $4, x, x; \ | ||
32 | \ | ||
33 | vpshufb tmp0, lo_t, tmp0; \ | ||
34 | vpshufb x, hi_t, x; \ | ||
35 | vpxor tmp0, x, x; | ||
36 | |||
37 | #define ymm0_x xmm0 | ||
38 | #define ymm1_x xmm1 | ||
39 | #define ymm2_x xmm2 | ||
40 | #define ymm3_x xmm3 | ||
41 | #define ymm4_x xmm4 | ||
42 | #define ymm5_x xmm5 | ||
43 | #define ymm6_x xmm6 | ||
44 | #define ymm7_x xmm7 | ||
45 | #define ymm8_x xmm8 | ||
46 | #define ymm9_x xmm9 | ||
47 | #define ymm10_x xmm10 | ||
48 | #define ymm11_x xmm11 | ||
49 | #define ymm12_x xmm12 | ||
50 | #define ymm13_x xmm13 | ||
51 | #define ymm14_x xmm14 | ||
52 | #define ymm15_x xmm15 | ||
53 | |||
54 | /* | ||
55 | * AES-NI instructions do not support ymmX registers, so we need splitting and | ||
56 | * merging. | ||
57 | */ | ||
58 | #define vaesenclast256(zero, yreg, tmp) \ | ||
59 | vextracti128 $1, yreg, tmp##_x; \ | ||
60 | vaesenclast zero##_x, yreg##_x, yreg##_x; \ | ||
61 | vaesenclast zero##_x, tmp##_x, tmp##_x; \ | ||
62 | vinserti128 $1, tmp##_x, yreg, yreg; | ||
63 | |||
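The vaesenclast256 macro works around AES-NI only accepting 128-bit xmm operands: the 256-bit state is split into its two lanes, AESENCLAST is run on each with an all-zero round key (which leaves only ShiftRows and SubBytes), and the lanes are merged back. The same idea written with compiler intrinsics, as a readability aid (a sketch, not code from this patch):

#include <immintrin.h>

/* Apply AESENCLAST with a zero round key to both 128-bit lanes of a ymm value. */
static __m256i aesenclast256(__m256i y)
{
	const __m128i zero = _mm_setzero_si128();
	__m128i lo = _mm256_castsi256_si128(y);        /* lower lane */
	__m128i hi = _mm256_extracti128_si256(y, 1);   /* upper lane */

	lo = _mm_aesenclast_si128(lo, zero);
	hi = _mm_aesenclast_si128(hi, zero);

	return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}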
64 | /********************************************************************** | ||
65 | 32-way camellia | ||
66 | **********************************************************************/ | ||
67 | |||
68 | /* | ||
69 | * IN: | ||
70 | * x0..x7: byte-sliced AB state | ||
71 | * mem_cd: register pointer storing CD state | ||
72 | * key: index for key material | ||
73 | * OUT: | ||
74 | * x0..x7: new byte-sliced CD state | ||
75 | */ | ||
76 | #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ | ||
77 | t7, mem_cd, key) \ | ||
78 | /* \ | ||
79 | * S-function with AES subbytes \ | ||
80 | */ \ | ||
81 | vbroadcasti128 .Linv_shift_row, t4; \ | ||
82 | vpbroadcastb .L0f0f0f0f, t7; \ | ||
83 | vbroadcasti128 .Lpre_tf_lo_s1, t0; \ | ||
84 | vbroadcasti128 .Lpre_tf_hi_s1, t1; \ | ||
85 | \ | ||
86 | /* AES inverse shift rows */ \ | ||
87 | vpshufb t4, x0, x0; \ | ||
88 | vpshufb t4, x7, x7; \ | ||
89 | vpshufb t4, x1, x1; \ | ||
90 | vpshufb t4, x4, x4; \ | ||
91 | vpshufb t4, x2, x2; \ | ||
92 | vpshufb t4, x5, x5; \ | ||
93 | vpshufb t4, x3, x3; \ | ||
94 | vpshufb t4, x6, x6; \ | ||
95 | \ | ||
96 | /* prefilter sboxes 1, 2 and 3 */ \ | ||
97 | vbroadcasti128 .Lpre_tf_lo_s4, t2; \ | ||
98 | vbroadcasti128 .Lpre_tf_hi_s4, t3; \ | ||
99 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
100 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
101 | filter_8bit(x1, t0, t1, t7, t6); \ | ||
102 | filter_8bit(x4, t0, t1, t7, t6); \ | ||
103 | filter_8bit(x2, t0, t1, t7, t6); \ | ||
104 | filter_8bit(x5, t0, t1, t7, t6); \ | ||
105 | \ | ||
106 | /* prefilter sbox 4 */ \ | ||
107 | vpxor t4##_x, t4##_x, t4##_x; \ | ||
108 | filter_8bit(x3, t2, t3, t7, t6); \ | ||
109 | filter_8bit(x6, t2, t3, t7, t6); \ | ||
110 | \ | ||
111 | /* AES subbytes + AES shift rows */ \ | ||
112 | vbroadcasti128 .Lpost_tf_lo_s1, t0; \ | ||
113 | vbroadcasti128 .Lpost_tf_hi_s1, t1; \ | ||
114 | vaesenclast256(t4, x0, t5); \ | ||
115 | vaesenclast256(t4, x7, t5); \ | ||
116 | vaesenclast256(t4, x1, t5); \ | ||
117 | vaesenclast256(t4, x4, t5); \ | ||
118 | vaesenclast256(t4, x2, t5); \ | ||
119 | vaesenclast256(t4, x5, t5); \ | ||
120 | vaesenclast256(t4, x3, t5); \ | ||
121 | vaesenclast256(t4, x6, t5); \ | ||
122 | \ | ||
123 | /* postfilter sboxes 1 and 4 */ \ | ||
124 | vbroadcasti128 .Lpost_tf_lo_s3, t2; \ | ||
125 | vbroadcasti128 .Lpost_tf_hi_s3, t3; \ | ||
126 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
127 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
128 | filter_8bit(x3, t0, t1, t7, t6); \ | ||
129 | filter_8bit(x6, t0, t1, t7, t6); \ | ||
130 | \ | ||
131 | /* postfilter sbox 3 */ \ | ||
132 | vbroadcasti128 .Lpost_tf_lo_s2, t4; \ | ||
133 | vbroadcasti128 .Lpost_tf_hi_s2, t5; \ | ||
134 | filter_8bit(x2, t2, t3, t7, t6); \ | ||
135 | filter_8bit(x5, t2, t3, t7, t6); \ | ||
136 | \ | ||
137 | vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ | ||
138 | \ | ||
139 | /* postfilter sbox 2 */ \ | ||
140 | filter_8bit(x1, t4, t5, t7, t2); \ | ||
141 | filter_8bit(x4, t4, t5, t7, t2); \ | ||
142 | \ | ||
143 | vpsrldq $1, t0, t1; \ | ||
144 | vpsrldq $2, t0, t2; \ | ||
145 | vpsrldq $3, t0, t3; \ | ||
146 | vpsrldq $4, t0, t4; \ | ||
147 | vpsrldq $5, t0, t5; \ | ||
148 | vpsrldq $6, t0, t6; \ | ||
149 | vpsrldq $7, t0, t7; \ | ||
150 | vpbroadcastb t0##_x, t0; \ | ||
151 | vpbroadcastb t1##_x, t1; \ | ||
152 | vpbroadcastb t2##_x, t2; \ | ||
153 | vpbroadcastb t3##_x, t3; \ | ||
154 | vpbroadcastb t4##_x, t4; \ | ||
155 | vpbroadcastb t6##_x, t6; \ | ||
156 | vpbroadcastb t5##_x, t5; \ | ||
157 | vpbroadcastb t7##_x, t7; \ | ||
158 | \ | ||
159 | /* P-function */ \ | ||
160 | vpxor x5, x0, x0; \ | ||
161 | vpxor x6, x1, x1; \ | ||
162 | vpxor x7, x2, x2; \ | ||
163 | vpxor x4, x3, x3; \ | ||
164 | \ | ||
165 | vpxor x2, x4, x4; \ | ||
166 | vpxor x3, x5, x5; \ | ||
167 | vpxor x0, x6, x6; \ | ||
168 | vpxor x1, x7, x7; \ | ||
169 | \ | ||
170 | vpxor x7, x0, x0; \ | ||
171 | vpxor x4, x1, x1; \ | ||
172 | vpxor x5, x2, x2; \ | ||
173 | vpxor x6, x3, x3; \ | ||
174 | \ | ||
175 | vpxor x3, x4, x4; \ | ||
176 | vpxor x0, x5, x5; \ | ||
177 | vpxor x1, x6, x6; \ | ||
178 | vpxor x2, x7, x7; /* note: high and low parts swapped */ \ | ||
179 | \ | ||
180 | /* Add key material and result to CD (x becomes new CD) */ \ | ||
181 | \ | ||
182 | vpxor t7, x0, x0; \ | ||
183 | vpxor 4 * 32(mem_cd), x0, x0; \ | ||
184 | \ | ||
185 | vpxor t6, x1, x1; \ | ||
186 | vpxor 5 * 32(mem_cd), x1, x1; \ | ||
187 | \ | ||
188 | vpxor t5, x2, x2; \ | ||
189 | vpxor 6 * 32(mem_cd), x2, x2; \ | ||
190 | \ | ||
191 | vpxor t4, x3, x3; \ | ||
192 | vpxor 7 * 32(mem_cd), x3, x3; \ | ||
193 | \ | ||
194 | vpxor t3, x4, x4; \ | ||
195 | vpxor 0 * 32(mem_cd), x4, x4; \ | ||
196 | \ | ||
197 | vpxor t2, x5, x5; \ | ||
198 | vpxor 1 * 32(mem_cd), x5, x5; \ | ||
199 | \ | ||
200 | vpxor t1, x6, x6; \ | ||
201 | vpxor 2 * 32(mem_cd), x6, x6; \ | ||
202 | \ | ||
203 | vpxor t0, x7, x7; \ | ||
204 | vpxor 3 * 32(mem_cd), x7, x7; | ||
205 | |||
206 | /* | ||
207 | * Size optimization... with inlined roundsm32 the binary would be over 5 times | ||
208 | * larger and only marginally faster. | ||
209 | */ | ||
210 | .align 8 | ||
211 | roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: | ||
212 | roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
213 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, | ||
214 | %rcx, (%r9)); | ||
215 | ret; | ||
216 | ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) | ||
217 | |||
218 | .align 8 | ||
219 | roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: | ||
220 | roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, | ||
221 | %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, | ||
222 | %rax, (%r9)); | ||
223 | ret; | ||
224 | ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | ||
225 | |||
226 | /* | ||
227 | * IN/OUT: | ||
228 | * x0..x7: byte-sliced AB state preloaded | ||
229 | * mem_ab: byte-sliced AB state in memory | ||
230 | * mem_cb: byte-sliced CD state in memory | ||
231 | */ | ||
232 | #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
233 | y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ | ||
234 | leaq (key_table + (i) * 8)(CTX), %r9; \ | ||
235 | call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ | ||
236 | \ | ||
237 | vmovdqu x0, 4 * 32(mem_cd); \ | ||
238 | vmovdqu x1, 5 * 32(mem_cd); \ | ||
239 | vmovdqu x2, 6 * 32(mem_cd); \ | ||
240 | vmovdqu x3, 7 * 32(mem_cd); \ | ||
241 | vmovdqu x4, 0 * 32(mem_cd); \ | ||
242 | vmovdqu x5, 1 * 32(mem_cd); \ | ||
243 | vmovdqu x6, 2 * 32(mem_cd); \ | ||
244 | vmovdqu x7, 3 * 32(mem_cd); \ | ||
245 | \ | ||
246 | leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ | ||
247 | call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ | ||
248 | \ | ||
249 | store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); | ||
250 | |||
251 | #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ | ||
252 | |||
253 | #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ | ||
254 | /* Store new AB state */ \ | ||
255 | vmovdqu x4, 4 * 32(mem_ab); \ | ||
256 | vmovdqu x5, 5 * 32(mem_ab); \ | ||
257 | vmovdqu x6, 6 * 32(mem_ab); \ | ||
258 | vmovdqu x7, 7 * 32(mem_ab); \ | ||
259 | vmovdqu x0, 0 * 32(mem_ab); \ | ||
260 | vmovdqu x1, 1 * 32(mem_ab); \ | ||
261 | vmovdqu x2, 2 * 32(mem_ab); \ | ||
262 | vmovdqu x3, 3 * 32(mem_ab); | ||
263 | |||
264 | #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
265 | y6, y7, mem_ab, mem_cd, i) \ | ||
266 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
267 | y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ | ||
268 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
269 | y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ | ||
270 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
271 | y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); | ||
272 | |||
273 | #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
274 | y6, y7, mem_ab, mem_cd, i) \ | ||
275 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
276 | y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ | ||
277 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
278 | y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ | ||
279 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
280 | y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); | ||
281 | |||
282 | /* | ||
283 | * IN: | ||
284 | * v0..3: byte-sliced 32-bit integers | ||
285 | * OUT: | ||
286 | * v0..3: (IN <<< 1) | ||
287 | */ | ||
288 | #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ | ||
289 | vpcmpgtb v0, zero, t0; \ | ||
290 | vpaddb v0, v0, v0; \ | ||
291 | vpabsb t0, t0; \ | ||
292 | \ | ||
293 | vpcmpgtb v1, zero, t1; \ | ||
294 | vpaddb v1, v1, v1; \ | ||
295 | vpabsb t1, t1; \ | ||
296 | \ | ||
297 | vpcmpgtb v2, zero, t2; \ | ||
298 | vpaddb v2, v2, v2; \ | ||
299 | vpabsb t2, t2; \ | ||
300 | \ | ||
301 | vpor t0, v1, v1; \ | ||
302 | \ | ||
303 | vpcmpgtb v3, zero, t0; \ | ||
304 | vpaddb v3, v3, v3; \ | ||
305 | vpabsb t0, t0; \ | ||
306 | \ | ||
307 | vpor t1, v2, v2; \ | ||
308 | vpor t2, v3, v3; \ | ||
309 | vpor t0, v0, v0; | ||
310 | |||
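rol32_1_32 performs the rotate on byte-sliced data: vpaddb doubles every byte slice, the bit that falls out of each byte is recovered with the vpcmpgtb-against-zero/vpabsb trick, and that carry is OR'd into the neighbouring slice, with the last slice's carry wrapping around to the first. Taken across the four slices of each 32-bit word this is the ordinary rotate-left-by-one (reference sketch):

#include <stdint.h>

/* Rotate a 32-bit word left by one bit. */
static inline uint32_t rol32_ref(uint32_t x)
{
	return (x << 1) | (x >> 31);
}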
311 | /* | ||
312 | * IN: | ||
313 | * r: byte-sliced AB state in memory | ||
314 | * l: byte-sliced CD state in memory | ||
315 | * OUT: | ||
316 | * x0..x7: new byte-sliced CD state | ||
317 | */ | ||
318 | #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ | ||
319 | tt1, tt2, tt3, kll, klr, krl, krr) \ | ||
320 | /* \ | ||
321 | * t0 = kll; \ | ||
322 | * t0 &= ll; \ | ||
323 | * lr ^= rol32(t0, 1); \ | ||
324 | */ \ | ||
325 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ | ||
326 | vpxor tt0, tt0, tt0; \ | ||
327 | vpbroadcastb t0##_x, t3; \ | ||
328 | vpsrldq $1, t0, t0; \ | ||
329 | vpbroadcastb t0##_x, t2; \ | ||
330 | vpsrldq $1, t0, t0; \ | ||
331 | vpbroadcastb t0##_x, t1; \ | ||
332 | vpsrldq $1, t0, t0; \ | ||
333 | vpbroadcastb t0##_x, t0; \ | ||
334 | \ | ||
335 | vpand l0, t0, t0; \ | ||
336 | vpand l1, t1, t1; \ | ||
337 | vpand l2, t2, t2; \ | ||
338 | vpand l3, t3, t3; \ | ||
339 | \ | ||
340 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
341 | \ | ||
342 | vpxor l4, t0, l4; \ | ||
343 | vmovdqu l4, 4 * 32(l); \ | ||
344 | vpxor l5, t1, l5; \ | ||
345 | vmovdqu l5, 5 * 32(l); \ | ||
346 | vpxor l6, t2, l6; \ | ||
347 | vmovdqu l6, 6 * 32(l); \ | ||
348 | vpxor l7, t3, l7; \ | ||
349 | vmovdqu l7, 7 * 32(l); \ | ||
350 | \ | ||
351 | /* \ | ||
352 | * t2 = krr; \ | ||
353 | * t2 |= rr; \ | ||
354 | * rl ^= t2; \ | ||
355 | */ \ | ||
356 | \ | ||
357 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ | ||
358 | vpbroadcastb t0##_x, t3; \ | ||
359 | vpsrldq $1, t0, t0; \ | ||
360 | vpbroadcastb t0##_x, t2; \ | ||
361 | vpsrldq $1, t0, t0; \ | ||
362 | vpbroadcastb t0##_x, t1; \ | ||
363 | vpsrldq $1, t0, t0; \ | ||
364 | vpbroadcastb t0##_x, t0; \ | ||
365 | \ | ||
366 | vpor 4 * 32(r), t0, t0; \ | ||
367 | vpor 5 * 32(r), t1, t1; \ | ||
368 | vpor 6 * 32(r), t2, t2; \ | ||
369 | vpor 7 * 32(r), t3, t3; \ | ||
370 | \ | ||
371 | vpxor 0 * 32(r), t0, t0; \ | ||
372 | vpxor 1 * 32(r), t1, t1; \ | ||
373 | vpxor 2 * 32(r), t2, t2; \ | ||
374 | vpxor 3 * 32(r), t3, t3; \ | ||
375 | vmovdqu t0, 0 * 32(r); \ | ||
376 | vmovdqu t1, 1 * 32(r); \ | ||
377 | vmovdqu t2, 2 * 32(r); \ | ||
378 | vmovdqu t3, 3 * 32(r); \ | ||
379 | \ | ||
380 | /* \ | ||
381 | * t2 = krl; \ | ||
382 | * t2 &= rl; \ | ||
383 | * rr ^= rol32(t2, 1); \ | ||
384 | */ \ | ||
385 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ | ||
386 | vpbroadcastb t0##_x, t3; \ | ||
387 | vpsrldq $1, t0, t0; \ | ||
388 | vpbroadcastb t0##_x, t2; \ | ||
389 | vpsrldq $1, t0, t0; \ | ||
390 | vpbroadcastb t0##_x, t1; \ | ||
391 | vpsrldq $1, t0, t0; \ | ||
392 | vpbroadcastb t0##_x, t0; \ | ||
393 | \ | ||
394 | vpand 0 * 32(r), t0, t0; \ | ||
395 | vpand 1 * 32(r), t1, t1; \ | ||
396 | vpand 2 * 32(r), t2, t2; \ | ||
397 | vpand 3 * 32(r), t3, t3; \ | ||
398 | \ | ||
399 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
400 | \ | ||
401 | vpxor 4 * 32(r), t0, t0; \ | ||
402 | vpxor 5 * 32(r), t1, t1; \ | ||
403 | vpxor 6 * 32(r), t2, t2; \ | ||
404 | vpxor 7 * 32(r), t3, t3; \ | ||
405 | vmovdqu t0, 4 * 32(r); \ | ||
406 | vmovdqu t1, 5 * 32(r); \ | ||
407 | vmovdqu t2, 6 * 32(r); \ | ||
408 | vmovdqu t3, 7 * 32(r); \ | ||
409 | \ | ||
410 | /* \ | ||
411 | * t0 = klr; \ | ||
412 | * t0 |= lr; \ | ||
413 | * ll ^= t0; \ | ||
414 | */ \ | ||
415 | \ | ||
416 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ | ||
417 | vpbroadcastb t0##_x, t3; \ | ||
418 | vpsrldq $1, t0, t0; \ | ||
419 | vpbroadcastb t0##_x, t2; \ | ||
420 | vpsrldq $1, t0, t0; \ | ||
421 | vpbroadcastb t0##_x, t1; \ | ||
422 | vpsrldq $1, t0, t0; \ | ||
423 | vpbroadcastb t0##_x, t0; \ | ||
424 | \ | ||
425 | vpor l4, t0, t0; \ | ||
426 | vpor l5, t1, t1; \ | ||
427 | vpor l6, t2, t2; \ | ||
428 | vpor l7, t3, t3; \ | ||
429 | \ | ||
430 | vpxor l0, t0, l0; \ | ||
431 | vmovdqu l0, 0 * 32(l); \ | ||
432 | vpxor l1, t1, l1; \ | ||
433 | vmovdqu l1, 1 * 32(l); \ | ||
434 | vpxor l2, t2, l2; \ | ||
435 | vmovdqu l2, 2 * 32(l); \ | ||
436 | vpxor l3, t3, l3; \ | ||
437 | vmovdqu l3, 3 * 32(l); | ||
438 | |||
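fls32 applies Camellia's FL function to one half of the state and FL⁻¹ to the other, exactly as the interleaved pseudocode comments spell out (lr ^= rol32(ll & kll, 1) then ll ^= (lr | klr); rl ^= (rr | krr) then rr ^= rol32(rl & krl, 1)), only with every 32-bit quantity byte-sliced across four ymm registers and the subkey bytes broadcast with vpbroadcastb. A compact non-sliced reference of the two functions (a sketch following the standard Camellia definition):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Camellia FL on a 64-bit half (xl:xr) with 64-bit subkey (kl:kr). */
static uint64_t camellia_fl(uint64_t x, uint32_t kl, uint32_t kr)
{
	uint32_t xl = (uint32_t)(x >> 32), xr = (uint32_t)x;

	xr ^= rol32(xl & kl, 1);
	xl ^= (xr | kr);
	return ((uint64_t)xl << 32) | xr;
}

/* Camellia FL^-1, applied to the other half of the state. */
static uint64_t camellia_flinv(uint64_t y, uint32_t kl, uint32_t kr)
{
	uint32_t yl = (uint32_t)(y >> 32), yr = (uint32_t)y;

	yl ^= (yr | kr);
	yr ^= rol32(yl & kl, 1);
	return ((uint64_t)yl << 32) | yr;
}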
439 | #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ | ||
440 | vpunpckhdq x1, x0, t2; \ | ||
441 | vpunpckldq x1, x0, x0; \ | ||
442 | \ | ||
443 | vpunpckldq x3, x2, t1; \ | ||
444 | vpunpckhdq x3, x2, x2; \ | ||
445 | \ | ||
446 | vpunpckhqdq t1, x0, x1; \ | ||
447 | vpunpcklqdq t1, x0, x0; \ | ||
448 | \ | ||
449 | vpunpckhqdq x2, t2, x3; \ | ||
450 | vpunpcklqdq x2, t2, x2; | ||
451 | |||
452 | #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ | ||
453 | a3, b3, c3, d3, st0, st1) \ | ||
454 | vmovdqu d2, st0; \ | ||
455 | vmovdqu d3, st1; \ | ||
456 | transpose_4x4(a0, a1, a2, a3, d2, d3); \ | ||
457 | transpose_4x4(b0, b1, b2, b3, d2, d3); \ | ||
458 | vmovdqu st0, d2; \ | ||
459 | vmovdqu st1, d3; \ | ||
460 | \ | ||
461 | vmovdqu a0, st0; \ | ||
462 | vmovdqu a1, st1; \ | ||
463 | transpose_4x4(c0, c1, c2, c3, a0, a1); \ | ||
464 | transpose_4x4(d0, d1, d2, d3, a0, a1); \ | ||
465 | \ | ||
466 | vbroadcasti128 .Lshufb_16x16b, a0; \ | ||
467 | vmovdqu st1, a1; \ | ||
468 | vpshufb a0, a2, a2; \ | ||
469 | vpshufb a0, a3, a3; \ | ||
470 | vpshufb a0, b0, b0; \ | ||
471 | vpshufb a0, b1, b1; \ | ||
472 | vpshufb a0, b2, b2; \ | ||
473 | vpshufb a0, b3, b3; \ | ||
474 | vpshufb a0, a1, a1; \ | ||
475 | vpshufb a0, c0, c0; \ | ||
476 | vpshufb a0, c1, c1; \ | ||
477 | vpshufb a0, c2, c2; \ | ||
478 | vpshufb a0, c3, c3; \ | ||
479 | vpshufb a0, d0, d0; \ | ||
480 | vpshufb a0, d1, d1; \ | ||
481 | vpshufb a0, d2, d2; \ | ||
482 | vpshufb a0, d3, d3; \ | ||
483 | vmovdqu d3, st1; \ | ||
484 | vmovdqu st0, d3; \ | ||
485 | vpshufb a0, d3, a0; \ | ||
486 | vmovdqu d2, st0; \ | ||
487 | \ | ||
488 | transpose_4x4(a0, b0, c0, d0, d2, d3); \ | ||
489 | transpose_4x4(a1, b1, c1, d1, d2, d3); \ | ||
490 | vmovdqu st0, d2; \ | ||
491 | vmovdqu st1, d3; \ | ||
492 | \ | ||
493 | vmovdqu b0, st0; \ | ||
494 | vmovdqu b1, st1; \ | ||
495 | transpose_4x4(a2, b2, c2, d2, b0, b1); \ | ||
496 | transpose_4x4(a3, b3, c3, d3, b0, b1); \ | ||
497 | vmovdqu st0, b0; \ | ||
498 | vmovdqu st1, b1; \ | ||
499 | /* does not adjust output bytes inside vectors */ | ||
500 | |||
501 | /* load blocks to registers and apply pre-whitening */ | ||
502 | #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
503 | y6, y7, rio, key) \ | ||
504 | vpbroadcastq key, x0; \ | ||
505 | vpshufb .Lpack_bswap, x0, x0; \ | ||
506 | \ | ||
507 | vpxor 0 * 32(rio), x0, y7; \ | ||
508 | vpxor 1 * 32(rio), x0, y6; \ | ||
509 | vpxor 2 * 32(rio), x0, y5; \ | ||
510 | vpxor 3 * 32(rio), x0, y4; \ | ||
511 | vpxor 4 * 32(rio), x0, y3; \ | ||
512 | vpxor 5 * 32(rio), x0, y2; \ | ||
513 | vpxor 6 * 32(rio), x0, y1; \ | ||
514 | vpxor 7 * 32(rio), x0, y0; \ | ||
515 | vpxor 8 * 32(rio), x0, x7; \ | ||
516 | vpxor 9 * 32(rio), x0, x6; \ | ||
517 | vpxor 10 * 32(rio), x0, x5; \ | ||
518 | vpxor 11 * 32(rio), x0, x4; \ | ||
519 | vpxor 12 * 32(rio), x0, x3; \ | ||
520 | vpxor 13 * 32(rio), x0, x2; \ | ||
521 | vpxor 14 * 32(rio), x0, x1; \ | ||
522 | vpxor 15 * 32(rio), x0, x0; | ||
523 | |||
524 | /* byteslice pre-whitened blocks and store to temporary memory */ | ||
525 | #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
526 | y6, y7, mem_ab, mem_cd) \ | ||
527 | byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ | ||
528 | y4, y5, y6, y7, (mem_ab), (mem_cd)); \ | ||
529 | \ | ||
530 | vmovdqu x0, 0 * 32(mem_ab); \ | ||
531 | vmovdqu x1, 1 * 32(mem_ab); \ | ||
532 | vmovdqu x2, 2 * 32(mem_ab); \ | ||
533 | vmovdqu x3, 3 * 32(mem_ab); \ | ||
534 | vmovdqu x4, 4 * 32(mem_ab); \ | ||
535 | vmovdqu x5, 5 * 32(mem_ab); \ | ||
536 | vmovdqu x6, 6 * 32(mem_ab); \ | ||
537 | vmovdqu x7, 7 * 32(mem_ab); \ | ||
538 | vmovdqu y0, 0 * 32(mem_cd); \ | ||
539 | vmovdqu y1, 1 * 32(mem_cd); \ | ||
540 | vmovdqu y2, 2 * 32(mem_cd); \ | ||
541 | vmovdqu y3, 3 * 32(mem_cd); \ | ||
542 | vmovdqu y4, 4 * 32(mem_cd); \ | ||
543 | vmovdqu y5, 5 * 32(mem_cd); \ | ||
544 | vmovdqu y6, 6 * 32(mem_cd); \ | ||
545 | vmovdqu y7, 7 * 32(mem_cd); | ||
546 | |||
547 | /* de-byteslice, apply post-whitening and store blocks */ | ||
548 | #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ | ||
549 | y5, y6, y7, key, stack_tmp0, stack_tmp1) \ | ||
550 | byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ | ||
551 | y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ | ||
552 | \ | ||
553 | vmovdqu x0, stack_tmp0; \ | ||
554 | \ | ||
555 | vpbroadcastq key, x0; \ | ||
556 | vpshufb .Lpack_bswap, x0, x0; \ | ||
557 | \ | ||
558 | vpxor x0, y7, y7; \ | ||
559 | vpxor x0, y6, y6; \ | ||
560 | vpxor x0, y5, y5; \ | ||
561 | vpxor x0, y4, y4; \ | ||
562 | vpxor x0, y3, y3; \ | ||
563 | vpxor x0, y2, y2; \ | ||
564 | vpxor x0, y1, y1; \ | ||
565 | vpxor x0, y0, y0; \ | ||
566 | vpxor x0, x7, x7; \ | ||
567 | vpxor x0, x6, x6; \ | ||
568 | vpxor x0, x5, x5; \ | ||
569 | vpxor x0, x4, x4; \ | ||
570 | vpxor x0, x3, x3; \ | ||
571 | vpxor x0, x2, x2; \ | ||
572 | vpxor x0, x1, x1; \ | ||
573 | vpxor stack_tmp0, x0, x0; | ||
574 | |||
575 | #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
576 | y6, y7, rio) \ | ||
577 | vmovdqu x0, 0 * 32(rio); \ | ||
578 | vmovdqu x1, 1 * 32(rio); \ | ||
579 | vmovdqu x2, 2 * 32(rio); \ | ||
580 | vmovdqu x3, 3 * 32(rio); \ | ||
581 | vmovdqu x4, 4 * 32(rio); \ | ||
582 | vmovdqu x5, 5 * 32(rio); \ | ||
583 | vmovdqu x6, 6 * 32(rio); \ | ||
584 | vmovdqu x7, 7 * 32(rio); \ | ||
585 | vmovdqu y0, 8 * 32(rio); \ | ||
586 | vmovdqu y1, 9 * 32(rio); \ | ||
587 | vmovdqu y2, 10 * 32(rio); \ | ||
588 | vmovdqu y3, 11 * 32(rio); \ | ||
589 | vmovdqu y4, 12 * 32(rio); \ | ||
590 | vmovdqu y5, 13 * 32(rio); \ | ||
591 | vmovdqu y6, 14 * 32(rio); \ | ||
592 | vmovdqu y7, 15 * 32(rio); | ||
593 | |||
594 | .data | ||
595 | .align 32 | ||
596 | |||
597 | #define SHUFB_BYTES(idx) \ | ||
598 | 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) | ||
599 | |||
600 | .Lshufb_16x16b: | ||
601 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) | ||
602 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) | ||
603 | |||
604 | .Lpack_bswap: | ||
605 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 | ||
606 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 | ||
607 | |||
608 | /* For CTR-mode IV byteswap */ | ||
609 | .Lbswap128_mask: | ||
610 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
611 | |||
612 | /* For XTS mode */ | ||
613 | .Lxts_gf128mul_and_shl1_mask_0: | ||
614 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
615 | .Lxts_gf128mul_and_shl1_mask_1: | ||
616 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
617 | |||
618 | /* | ||
619 | * pre-SubByte transform | ||
620 | * | ||
621 | * pre-lookup for sbox1, sbox2, sbox3: | ||
622 | * swap_bitendianness( | ||
623 | * isom_map_camellia_to_aes( | ||
624 | * camellia_f( | ||
625 | * swap_bitendianness(in) | ||
626 | * ) | ||
627 | * ) | ||
628 | * ) | ||
629 | * | ||
630 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
631 | */ | ||
632 | .Lpre_tf_lo_s1: | ||
633 | .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 | ||
634 | .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 | ||
635 | .Lpre_tf_hi_s1: | ||
636 | .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a | ||
637 | .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 | ||
638 | |||
639 | /* | ||
640 | * pre-SubByte transform | ||
641 | * | ||
642 | * pre-lookup for sbox4: | ||
643 | * swap_bitendianness( | ||
644 | * isom_map_camellia_to_aes( | ||
645 | * camellia_f( | ||
646 | * swap_bitendianness(in <<< 1) | ||
647 | * ) | ||
648 | * ) | ||
649 | * ) | ||
650 | * | ||
651 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
652 | */ | ||
653 | .Lpre_tf_lo_s4: | ||
654 | .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 | ||
655 | .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 | ||
656 | .Lpre_tf_hi_s4: | ||
657 | .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 | ||
658 | .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf | ||
659 | |||
660 | /* | ||
661 | * post-SubByte transform | ||
662 | * | ||
663 | * post-lookup for sbox1, sbox4: | ||
664 | * swap_bitendianness( | ||
665 | * camellia_h( | ||
666 | * isom_map_aes_to_camellia( | ||
667 | * swap_bitendianness( | ||
668 | * aes_inverse_affine_transform(in) | ||
669 | * ) | ||
670 | * ) | ||
671 | * ) | ||
672 | * ) | ||
673 | * | ||
674 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
675 | */ | ||
676 | .Lpost_tf_lo_s1: | ||
677 | .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 | ||
678 | .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 | ||
679 | .Lpost_tf_hi_s1: | ||
680 | .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 | ||
681 | .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c | ||
682 | |||
683 | /* | ||
684 | * post-SubByte transform | ||
685 | * | ||
686 | * post-lookup for sbox2: | ||
687 | * swap_bitendianness( | ||
688 | * camellia_h( | ||
689 | * isom_map_aes_to_camellia( | ||
690 | * swap_bitendianness( | ||
691 | * aes_inverse_affine_transform(in) | ||
692 | * ) | ||
693 | * ) | ||
694 | * ) | ||
695 | * ) <<< 1 | ||
696 | * | ||
697 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
698 | */ | ||
699 | .Lpost_tf_lo_s2: | ||
700 | .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 | ||
701 | .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 | ||
702 | .Lpost_tf_hi_s2: | ||
703 | .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 | ||
704 | .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 | ||
705 | |||
706 | /* | ||
707 | * post-SubByte transform | ||
708 | * | ||
709 | * post-lookup for sbox3: | ||
710 | * swap_bitendianness( | ||
711 | * camellia_h( | ||
712 | * isom_map_aes_to_camellia( | ||
713 | * swap_bitendianness( | ||
714 | * aes_inverse_affine_transform(in) | ||
715 | * ) | ||
716 | * ) | ||
717 | * ) | ||
718 | * ) >>> 1 | ||
719 | * | ||
720 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
721 | */ | ||
722 | .Lpost_tf_lo_s3: | ||
723 | .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 | ||
724 | .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 | ||
725 | .Lpost_tf_hi_s3: | ||
726 | .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 | ||
727 | .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 | ||
728 | |||
729 | /* For isolating SubBytes from AESENCLAST, inverse shift row */ | ||
730 | .Linv_shift_row: | ||
731 | .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b | ||
732 | .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 | ||
733 | |||
734 | .align 4 | ||
735 | /* 4-bit mask */ | ||
736 | .L0f0f0f0f: | ||
737 | .long 0x0f0f0f0f | ||
738 | |||
739 | .text | ||
740 | |||
741 | .align 8 | ||
742 | __camellia_enc_blk32: | ||
743 | /* input: | ||
744 | * %rdi: ctx, CTX | ||
745 | * %rax: temporary storage, 512 bytes | ||
746 | * %ymm0..%ymm15: 32 plaintext blocks | ||
747 | * output: | ||
748 | * %ymm0..%ymm15: 32 encrypted blocks, order swapped: | ||
749 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | ||
750 | */ | ||
751 | |||
752 | leaq 8 * 32(%rax), %rcx; | ||
753 | |||
754 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
755 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
756 | %ymm15, %rax, %rcx); | ||
757 | |||
758 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
759 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
760 | %ymm15, %rax, %rcx, 0); | ||
761 | |||
762 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
763 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
764 | %ymm15, | ||
765 | ((key_table + (8) * 8) + 0)(CTX), | ||
766 | ((key_table + (8) * 8) + 4)(CTX), | ||
767 | ((key_table + (8) * 8) + 8)(CTX), | ||
768 | ((key_table + (8) * 8) + 12)(CTX)); | ||
769 | |||
770 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
771 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
772 | %ymm15, %rax, %rcx, 8); | ||
773 | |||
774 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
775 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
776 | %ymm15, | ||
777 | ((key_table + (16) * 8) + 0)(CTX), | ||
778 | ((key_table + (16) * 8) + 4)(CTX), | ||
779 | ((key_table + (16) * 8) + 8)(CTX), | ||
780 | ((key_table + (16) * 8) + 12)(CTX)); | ||
781 | |||
782 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
783 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
784 | %ymm15, %rax, %rcx, 16); | ||
785 | |||
786 | movl $24, %r8d; | ||
787 | cmpl $16, key_length(CTX); | ||
788 | jne .Lenc_max32; | ||
789 | |||
790 | .Lenc_done: | ||
791 | /* load CD for output */ | ||
792 | vmovdqu 0 * 32(%rcx), %ymm8; | ||
793 | vmovdqu 1 * 32(%rcx), %ymm9; | ||
794 | vmovdqu 2 * 32(%rcx), %ymm10; | ||
795 | vmovdqu 3 * 32(%rcx), %ymm11; | ||
796 | vmovdqu 4 * 32(%rcx), %ymm12; | ||
797 | vmovdqu 5 * 32(%rcx), %ymm13; | ||
798 | vmovdqu 6 * 32(%rcx), %ymm14; | ||
799 | vmovdqu 7 * 32(%rcx), %ymm15; | ||
800 | |||
801 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
802 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
803 | %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); | ||
804 | |||
805 | ret; | ||
806 | |||
807 | .align 8 | ||
808 | .Lenc_max32: | ||
809 | movl $32, %r8d; | ||
810 | |||
811 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
812 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
813 | %ymm15, | ||
814 | ((key_table + (24) * 8) + 0)(CTX), | ||
815 | ((key_table + (24) * 8) + 4)(CTX), | ||
816 | ((key_table + (24) * 8) + 8)(CTX), | ||
817 | ((key_table + (24) * 8) + 12)(CTX)); | ||
818 | |||
819 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
820 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
821 | %ymm15, %rax, %rcx, 24); | ||
822 | |||
823 | jmp .Lenc_done; | ||
824 | ENDPROC(__camellia_enc_blk32) | ||
825 | |||
826 | .align 8 | ||
827 | __camellia_dec_blk32: | ||
828 | /* input: | ||
829 | * %rdi: ctx, CTX | ||
830 | * %rax: temporary storage, 512 bytes | ||
831 | * %r8d: 24 for 16 byte key, 32 for larger | ||
832 | * %ymm0..%ymm15: 32 encrypted blocks | ||
833 | * output: | ||
834 | * %ymm0..%ymm15: 32 plaintext blocks, order swapped: | ||
835 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | ||
836 | */ | ||
837 | |||
838 | leaq 8 * 32(%rax), %rcx; | ||
839 | |||
840 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
841 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
842 | %ymm15, %rax, %rcx); | ||
843 | |||
844 | cmpl $32, %r8d; | ||
845 | je .Ldec_max32; | ||
846 | |||
847 | .Ldec_max24: | ||
848 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
849 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
850 | %ymm15, %rax, %rcx, 16); | ||
851 | |||
852 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
853 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
854 | %ymm15, | ||
855 | ((key_table + (16) * 8) + 8)(CTX), | ||
856 | ((key_table + (16) * 8) + 12)(CTX), | ||
857 | ((key_table + (16) * 8) + 0)(CTX), | ||
858 | ((key_table + (16) * 8) + 4)(CTX)); | ||
859 | |||
860 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
861 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
862 | %ymm15, %rax, %rcx, 8); | ||
863 | |||
864 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
865 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
866 | %ymm15, | ||
867 | ((key_table + (8) * 8) + 8)(CTX), | ||
868 | ((key_table + (8) * 8) + 12)(CTX), | ||
869 | ((key_table + (8) * 8) + 0)(CTX), | ||
870 | ((key_table + (8) * 8) + 4)(CTX)); | ||
871 | |||
872 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
873 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
874 | %ymm15, %rax, %rcx, 0); | ||
875 | |||
876 | /* load CD for output */ | ||
877 | vmovdqu 0 * 32(%rcx), %ymm8; | ||
878 | vmovdqu 1 * 32(%rcx), %ymm9; | ||
879 | vmovdqu 2 * 32(%rcx), %ymm10; | ||
880 | vmovdqu 3 * 32(%rcx), %ymm11; | ||
881 | vmovdqu 4 * 32(%rcx), %ymm12; | ||
882 | vmovdqu 5 * 32(%rcx), %ymm13; | ||
883 | vmovdqu 6 * 32(%rcx), %ymm14; | ||
884 | vmovdqu 7 * 32(%rcx), %ymm15; | ||
885 | |||
886 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
887 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
888 | %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); | ||
889 | |||
890 | ret; | ||
891 | |||
892 | .align 8 | ||
893 | .Ldec_max32: | ||
894 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
895 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
896 | %ymm15, %rax, %rcx, 24); | ||
897 | |||
898 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
899 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
900 | %ymm15, | ||
901 | ((key_table + (24) * 8) + 8)(CTX), | ||
902 | ((key_table + (24) * 8) + 12)(CTX), | ||
903 | ((key_table + (24) * 8) + 0)(CTX), | ||
904 | ((key_table + (24) * 8) + 4)(CTX)); | ||
905 | |||
906 | jmp .Ldec_max24; | ||
907 | ENDPROC(__camellia_dec_blk32) | ||
908 | |||
909 | ENTRY(camellia_ecb_enc_32way) | ||
910 | /* input: | ||
911 | * %rdi: ctx, CTX | ||
912 | * %rsi: dst (32 blocks) | ||
913 | * %rdx: src (32 blocks) | ||
914 | */ | ||
915 | |||
916 | vzeroupper; | ||
917 | |||
918 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
919 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
920 | %ymm15, %rdx, (key_table)(CTX)); | ||
921 | |||
922 | /* now dst can be used as temporary buffer (even in src == dst case) */ | ||
923 | movq %rsi, %rax; | ||
924 | |||
925 | call __camellia_enc_blk32; | ||
926 | |||
927 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
928 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
929 | %ymm8, %rsi); | ||
930 | |||
931 | vzeroupper; | ||
932 | |||
933 | ret; | ||
934 | ENDPROC(camellia_ecb_enc_32way) | ||
935 | |||
936 | ENTRY(camellia_ecb_dec_32way) | ||
937 | /* input: | ||
938 | * %rdi: ctx, CTX | ||
939 | * %rsi: dst (32 blocks) | ||
940 | * %rdx: src (32 blocks) | ||
941 | */ | ||
942 | |||
943 | vzeroupper; | ||
944 | |||
945 | cmpl $16, key_length(CTX); | ||
946 | movl $32, %r8d; | ||
947 | movl $24, %eax; | ||
948 | cmovel %eax, %r8d; /* max */ | ||
949 | |||
950 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
951 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
952 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | ||
953 | |||
954 | /* now dst can be used as temporary buffer (even in src == dst case) */ | ||
955 | movq %rsi, %rax; | ||
956 | |||
957 | call __camellia_dec_blk32; | ||
958 | |||
959 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
960 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
961 | %ymm8, %rsi); | ||
962 | |||
963 | vzeroupper; | ||
964 | |||
965 | ret; | ||
966 | ENDPROC(camellia_ecb_dec_32way) | ||
967 | |||
968 | ENTRY(camellia_cbc_dec_32way) | ||
969 | /* input: | ||
970 | * %rdi: ctx, CTX | ||
971 | * %rsi: dst (32 blocks) | ||
972 | * %rdx: src (32 blocks) | ||
973 | */ | ||
974 | |||
975 | vzeroupper; | ||
976 | |||
977 | cmpl $16, key_length(CTX); | ||
978 | movl $32, %r8d; | ||
979 | movl $24, %eax; | ||
980 | cmovel %eax, %r8d; /* max */ | ||
981 | |||
982 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
983 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
984 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | ||
985 | |||
986 | movq %rsp, %r10; | ||
987 | cmpq %rsi, %rdx; | ||
988 | je .Lcbc_dec_use_stack; | ||
989 | |||
990 | /* dst can be used as temporary storage, src is not overwritten. */ | ||
991 | movq %rsi, %rax; | ||
992 | jmp .Lcbc_dec_continue; | ||
993 | |||
994 | .Lcbc_dec_use_stack: | ||
995 | /* | ||
996 | * dst still in-use (because dst == src), so use stack for temporary | ||
997 | * storage. | ||
998 | */ | ||
999 | subq $(16 * 32), %rsp; | ||
1000 | movq %rsp, %rax; | ||
1001 | |||
1002 | .Lcbc_dec_continue: | ||
1003 | call __camellia_dec_blk32; | ||
1004 | |||
1005 | vmovdqu %ymm7, (%rax); | ||
1006 | vpxor %ymm7, %ymm7, %ymm7; | ||
1007 | vinserti128 $1, (%rdx), %ymm7, %ymm7; | ||
1008 | vpxor (%rax), %ymm7, %ymm7; | ||
1009 | movq %r10, %rsp; | ||
1010 | vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; | ||
1011 | vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; | ||
1012 | vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; | ||
1013 | vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; | ||
1014 | vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; | ||
1015 | vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; | ||
1016 | vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; | ||
1017 | vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; | ||
1018 | vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; | ||
1019 | vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; | ||
1020 | vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; | ||
1021 | vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; | ||
1022 | vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; | ||
1023 | vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; | ||
1024 | vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; | ||
1025 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1026 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1027 | %ymm8, %rsi); | ||
1028 | |||
1029 | vzeroupper; | ||
1030 | |||
1031 | ret; | ||
1032 | ENDPROC(camellia_cbc_dec_32way) | ||
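camellia_cbc_dec_32way follows the usual CBC decryption rule P_i = D_K(C_i) ⊕ C_{i−1}: after the 32 blocks are decrypted, every block except the first is xored with the ciphertext block sitting 16 bytes earlier in the source buffer (that is what the n * 32 + 16 offsets and the vinserti128 of the first source block implement); xoring the very first block with the IV is left to the C glue code. A plain C reference of the per-block rule (a sketch; decrypt_block stands in for a hypothetical single-block Camellia decryption):

#include <stdint.h>
#include <stddef.h>

typedef void (*block_fn)(uint8_t out[16], const uint8_t in[16]);

/* CBC decryption: P[i] = D_K(C[i]) ^ C[i-1], with P[0] = D_K(C[0]) ^ IV.
 * Walking back to front keeps the needed previous ciphertext intact even
 * when dst == src. */
static void cbc_decrypt_ref(uint8_t *dst, const uint8_t *src, size_t nblocks,
			    const uint8_t iv[16], block_fn decrypt_block)
{
	size_t i, j;

	for (i = nblocks; i-- > 0; ) {
		const uint8_t *prev = i ? src + (i - 1) * 16 : iv;

		decrypt_block(dst + i * 16, src + i * 16);
		for (j = 0; j < 16; j++)
			dst[i * 16 + j] ^= prev[j];
	}
}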
1033 | |||
1034 | #define inc_le128(x, minus_one, tmp) \ | ||
1035 | vpcmpeqq minus_one, x, tmp; \ | ||
1036 | vpsubq minus_one, x, x; \ | ||
1037 | vpslldq $8, tmp, tmp; \ | ||
1038 | vpsubq tmp, x, x; | ||
1039 | |||
1040 | #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ | ||
1041 | vpcmpeqq minus_one, x, tmp1; \ | ||
1042 | vpcmpeqq minus_two, x, tmp2; \ | ||
1043 | vpsubq minus_two, x, x; \ | ||
1044 | vpor tmp2, tmp1, tmp1; \ | ||
1045 | vpslldq $8, tmp1, tmp1; \ | ||
1046 | vpsubq tmp1, x, x; | ||
1047 | |||
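inc_le128 and add2_le128 advance the 128-bit CTR counter, which the caller hands over in little-endian form (per the comment on camellia_ctr_32way below) so the arithmetic stays cheap; each counter value is byte-swapped with .Lbswap128_mask before it is fed to the cipher. vpcmpeqq detects, before the addition, whether the low qword is about to wrap (equal to -1, or to -1/-2 for the add-by-two variant); the resulting all-ones mask is moved into the high-qword position with vpslldq and subtracted, which propagates the carry. In scalar C the update is simply (sketch):

#include <stdint.h>

/* Add 'step' (1 or 2) to a 128-bit little-endian counter with carry. */
static void inc_le128_ref(uint64_t ctr[2], uint64_t step)
{
	uint64_t old = ctr[0];

	ctr[0] += step;
	if (ctr[0] < old)        /* unsigned wrap of the low qword */
		ctr[1]++;
}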
1048 | ENTRY(camellia_ctr_32way) | ||
1049 | /* input: | ||
1050 | * %rdi: ctx, CTX | ||
1051 | * %rsi: dst (32 blocks) | ||
1052 | * %rdx: src (32 blocks) | ||
1053 | * %rcx: iv (little endian, 128bit) | ||
1054 | */ | ||
1055 | |||
1056 | vzeroupper; | ||
1057 | |||
1058 | movq %rsp, %r10; | ||
1059 | cmpq %rsi, %rdx; | ||
1060 | je .Lctr_use_stack; | ||
1061 | |||
1062 | /* dst can be used as temporary storage, src is not overwritten. */ | ||
1063 | movq %rsi, %rax; | ||
1064 | jmp .Lctr_continue; | ||
1065 | |||
1066 | .Lctr_use_stack: | ||
1067 | subq $(16 * 32), %rsp; | ||
1068 | movq %rsp, %rax; | ||
1069 | |||
1070 | .Lctr_continue: | ||
1071 | vpcmpeqd %ymm15, %ymm15, %ymm15; | ||
1072 | vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ | ||
1073 | vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ | ||
1074 | |||
1075 | /* load IV and byteswap */ | ||
1076 | vmovdqu (%rcx), %xmm0; | ||
1077 | vmovdqa %xmm0, %xmm1; | ||
1078 | inc_le128(%xmm0, %xmm15, %xmm14); | ||
1079 | vbroadcasti128 .Lbswap128_mask, %ymm14; | ||
1080 | vinserti128 $1, %xmm0, %ymm1, %ymm0; | ||
1081 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1082 | vmovdqu %ymm13, 15 * 32(%rax); | ||
1083 | |||
1084 | /* construct IVs */ | ||
1085 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ | ||
1086 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1087 | vmovdqu %ymm13, 14 * 32(%rax); | ||
1088 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1089 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1090 | vmovdqu %ymm13, 13 * 32(%rax); | ||
1091 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1092 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1093 | vmovdqu %ymm13, 12 * 32(%rax); | ||
1094 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1095 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1096 | vmovdqu %ymm13, 11 * 32(%rax); | ||
1097 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1098 | vpshufb %ymm14, %ymm0, %ymm10; | ||
1099 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1100 | vpshufb %ymm14, %ymm0, %ymm9; | ||
1101 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1102 | vpshufb %ymm14, %ymm0, %ymm8; | ||
1103 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1104 | vpshufb %ymm14, %ymm0, %ymm7; | ||
1105 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1106 | vpshufb %ymm14, %ymm0, %ymm6; | ||
1107 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1108 | vpshufb %ymm14, %ymm0, %ymm5; | ||
1109 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1110 | vpshufb %ymm14, %ymm0, %ymm4; | ||
1111 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1112 | vpshufb %ymm14, %ymm0, %ymm3; | ||
1113 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1114 | vpshufb %ymm14, %ymm0, %ymm2; | ||
1115 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1116 | vpshufb %ymm14, %ymm0, %ymm1; | ||
1117 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1118 | vextracti128 $1, %ymm0, %xmm13; | ||
1119 | vpshufb %ymm14, %ymm0, %ymm0; | ||
1120 | inc_le128(%xmm13, %xmm15, %xmm14); | ||
1121 | vmovdqu %xmm13, (%rcx); | ||
1122 | |||
1123 | /* inpack32_pre: */ | ||
1124 | vpbroadcastq (key_table)(CTX), %ymm15; | ||
1125 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | ||
1126 | vpxor %ymm0, %ymm15, %ymm0; | ||
1127 | vpxor %ymm1, %ymm15, %ymm1; | ||
1128 | vpxor %ymm2, %ymm15, %ymm2; | ||
1129 | vpxor %ymm3, %ymm15, %ymm3; | ||
1130 | vpxor %ymm4, %ymm15, %ymm4; | ||
1131 | vpxor %ymm5, %ymm15, %ymm5; | ||
1132 | vpxor %ymm6, %ymm15, %ymm6; | ||
1133 | vpxor %ymm7, %ymm15, %ymm7; | ||
1134 | vpxor %ymm8, %ymm15, %ymm8; | ||
1135 | vpxor %ymm9, %ymm15, %ymm9; | ||
1136 | vpxor %ymm10, %ymm15, %ymm10; | ||
1137 | vpxor 11 * 32(%rax), %ymm15, %ymm11; | ||
1138 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | ||
1139 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | ||
1140 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | ||
1141 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | ||
1142 | |||
1143 | call __camellia_enc_blk32; | ||
1144 | |||
1145 | movq %r10, %rsp; | ||
1146 | |||
1147 | vpxor 0 * 32(%rdx), %ymm7, %ymm7; | ||
1148 | vpxor 1 * 32(%rdx), %ymm6, %ymm6; | ||
1149 | vpxor 2 * 32(%rdx), %ymm5, %ymm5; | ||
1150 | vpxor 3 * 32(%rdx), %ymm4, %ymm4; | ||
1151 | vpxor 4 * 32(%rdx), %ymm3, %ymm3; | ||
1152 | vpxor 5 * 32(%rdx), %ymm2, %ymm2; | ||
1153 | vpxor 6 * 32(%rdx), %ymm1, %ymm1; | ||
1154 | vpxor 7 * 32(%rdx), %ymm0, %ymm0; | ||
1155 | vpxor 8 * 32(%rdx), %ymm15, %ymm15; | ||
1156 | vpxor 9 * 32(%rdx), %ymm14, %ymm14; | ||
1157 | vpxor 10 * 32(%rdx), %ymm13, %ymm13; | ||
1158 | vpxor 11 * 32(%rdx), %ymm12, %ymm12; | ||
1159 | vpxor 12 * 32(%rdx), %ymm11, %ymm11; | ||
1160 | vpxor 13 * 32(%rdx), %ymm10, %ymm10; | ||
1161 | vpxor 14 * 32(%rdx), %ymm9, %ymm9; | ||
1162 | vpxor 15 * 32(%rdx), %ymm8, %ymm8; | ||
1163 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1164 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1165 | %ymm8, %rsi); | ||
1166 | |||
1167 | vzeroupper; | ||
1168 | |||
1169 | ret; | ||
1170 | ENDPROC(camellia_ctr_32way) | ||
1171 | |||
1172 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
1173 | vpsrad $31, iv, tmp; \ | ||
1174 | vpaddq iv, iv, iv; \ | ||
1175 | vpshufd $0x13, tmp, tmp; \ | ||
1176 | vpand mask, tmp, tmp; \ | ||
1177 | vpxor tmp, iv, iv; | ||
1178 | |||
1179 | #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ | ||
1180 | vpsrad $31, iv, tmp0; \ | ||
1181 | vpaddq iv, iv, tmp1; \ | ||
1182 | vpsllq $2, iv, iv; \ | ||
1183 | vpshufd $0x13, tmp0, tmp0; \ | ||
1184 | vpsrad $31, tmp1, tmp1; \ | ||
1185 | vpand mask2, tmp0, tmp0; \ | ||
1186 | vpshufd $0x13, tmp1, tmp1; \ | ||
1187 | vpxor tmp0, iv, iv; \ | ||
1188 | vpand mask1, tmp1, tmp1; \ | ||
1189 | vpxor tmp1, iv, iv; | ||
1190 | |||
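Since every ymm register holds two consecutive XTS blocks (the low lane gets tweak T, the high lane T·α), the 32-way code advances the tweak pair by α² per step. gf128mul_x2_ble is simply two of the single doublings folded together; the second mask, .Lxts_gf128mul_and_shl1_mask_1, carries the pre-shifted reduction constants for the extra bit position. A scalar sketch showing that applying the single doubling twice gives the same result:

#include <stdint.h>

/* Multiply a 128-bit little-endian XTS tweak by alpha^2 in GF(2^128):
 * two single-bit doublings, each reduced with 0x87. */
static void gf128mul_x2_ble_ref(uint64_t t[2])
{
	int i;

	for (i = 0; i < 2; i++) {
		uint64_t carry = t[1] >> 63;

		t[1] = (t[1] << 1) | (t[0] >> 63);
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	}
}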
1191 | .align 8 | ||
1192 | camellia_xts_crypt_32way: | ||
1193 | /* input: | ||
1194 | * %rdi: ctx, CTX | ||
1195 | * %rsi: dst (32 blocks) | ||
1196 | * %rdx: src (32 blocks) | ||
1197 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1198 | * %r8: index for input whitening key | ||
1199 | * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 | ||
1200 | */ | ||
1201 | |||
1202 | vzeroupper; | ||
1203 | |||
1204 | subq $(16 * 32), %rsp; | ||
1205 | movq %rsp, %rax; | ||
1206 | |||
1207 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; | ||
1208 | |||
1209 | /* load IV and construct second IV */ | ||
1210 | vmovdqu (%rcx), %xmm0; | ||
1211 | vmovdqa %xmm0, %xmm15; | ||
1212 | gf128mul_x_ble(%xmm0, %xmm12, %xmm13); | ||
1213 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; | ||
1214 | vinserti128 $1, %xmm0, %ymm15, %ymm0; | ||
1215 | vpxor 0 * 32(%rdx), %ymm0, %ymm15; | ||
1216 | vmovdqu %ymm15, 15 * 32(%rax); | ||
1217 | vmovdqu %ymm0, 0 * 32(%rsi); | ||
1218 | |||
1219 | /* construct IVs */ | ||
1220 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1221 | vpxor 1 * 32(%rdx), %ymm0, %ymm15; | ||
1222 | vmovdqu %ymm15, 14 * 32(%rax); | ||
1223 | vmovdqu %ymm0, 1 * 32(%rsi); | ||
1224 | |||
1225 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1226 | vpxor 2 * 32(%rdx), %ymm0, %ymm15; | ||
1227 | vmovdqu %ymm15, 13 * 32(%rax); | ||
1228 | vmovdqu %ymm0, 2 * 32(%rsi); | ||
1229 | |||
1230 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1231 | vpxor 3 * 32(%rdx), %ymm0, %ymm15; | ||
1232 | vmovdqu %ymm15, 12 * 32(%rax); | ||
1233 | vmovdqu %ymm0, 3 * 32(%rsi); | ||
1234 | |||
1235 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1236 | vpxor 4 * 32(%rdx), %ymm0, %ymm11; | ||
1237 | vmovdqu %ymm0, 4 * 32(%rsi); | ||
1238 | |||
1239 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1240 | vpxor 5 * 32(%rdx), %ymm0, %ymm10; | ||
1241 | vmovdqu %ymm0, 5 * 32(%rsi); | ||
1242 | |||
1243 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1244 | vpxor 6 * 32(%rdx), %ymm0, %ymm9; | ||
1245 | vmovdqu %ymm0, 6 * 32(%rsi); | ||
1246 | |||
1247 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1248 | vpxor 7 * 32(%rdx), %ymm0, %ymm8; | ||
1249 | vmovdqu %ymm0, 7 * 32(%rsi); | ||
1250 | |||
1251 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1252 | vpxor 8 * 32(%rdx), %ymm0, %ymm7; | ||
1253 | vmovdqu %ymm0, 8 * 32(%rsi); | ||
1254 | |||
1255 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1256 | vpxor 9 * 32(%rdx), %ymm0, %ymm6; | ||
1257 | vmovdqu %ymm0, 9 * 32(%rsi); | ||
1258 | |||
1259 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1260 | vpxor 10 * 32(%rdx), %ymm0, %ymm5; | ||
1261 | vmovdqu %ymm0, 10 * 32(%rsi); | ||
1262 | |||
1263 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1264 | vpxor 11 * 32(%rdx), %ymm0, %ymm4; | ||
1265 | vmovdqu %ymm0, 11 * 32(%rsi); | ||
1266 | |||
1267 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1268 | vpxor 12 * 32(%rdx), %ymm0, %ymm3; | ||
1269 | vmovdqu %ymm0, 12 * 32(%rsi); | ||
1270 | |||
1271 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1272 | vpxor 13 * 32(%rdx), %ymm0, %ymm2; | ||
1273 | vmovdqu %ymm0, 13 * 32(%rsi); | ||
1274 | |||
1275 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1276 | vpxor 14 * 32(%rdx), %ymm0, %ymm1; | ||
1277 | vmovdqu %ymm0, 14 * 32(%rsi); | ||
1278 | |||
1279 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1280 | vpxor 15 * 32(%rdx), %ymm0, %ymm15; | ||
1281 | vmovdqu %ymm15, 0 * 32(%rax); | ||
1282 | vmovdqu %ymm0, 15 * 32(%rsi); | ||
1283 | |||
1284 | vextracti128 $1, %ymm0, %xmm0; | ||
1285 | gf128mul_x_ble(%xmm0, %xmm12, %xmm15); | ||
1286 | vmovdqu %xmm0, (%rcx); | ||
1287 | |||
1288 | /* inpack32_pre: */ | ||
1289 | vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; | ||
1290 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | ||
1291 | vpxor 0 * 32(%rax), %ymm15, %ymm0; | ||
1292 | vpxor %ymm1, %ymm15, %ymm1; | ||
1293 | vpxor %ymm2, %ymm15, %ymm2; | ||
1294 | vpxor %ymm3, %ymm15, %ymm3; | ||
1295 | vpxor %ymm4, %ymm15, %ymm4; | ||
1296 | vpxor %ymm5, %ymm15, %ymm5; | ||
1297 | vpxor %ymm6, %ymm15, %ymm6; | ||
1298 | vpxor %ymm7, %ymm15, %ymm7; | ||
1299 | vpxor %ymm8, %ymm15, %ymm8; | ||
1300 | vpxor %ymm9, %ymm15, %ymm9; | ||
1301 | vpxor %ymm10, %ymm15, %ymm10; | ||
1302 | vpxor %ymm11, %ymm15, %ymm11; | ||
1303 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | ||
1304 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | ||
1305 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | ||
1306 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | ||
1307 | |||
1308 | call *%r9; | ||
1309 | |||
1310 | addq $(16 * 32), %rsp; | ||
1311 | |||
1312 | vpxor 0 * 32(%rsi), %ymm7, %ymm7; | ||
1313 | vpxor 1 * 32(%rsi), %ymm6, %ymm6; | ||
1314 | vpxor 2 * 32(%rsi), %ymm5, %ymm5; | ||
1315 | vpxor 3 * 32(%rsi), %ymm4, %ymm4; | ||
1316 | vpxor 4 * 32(%rsi), %ymm3, %ymm3; | ||
1317 | vpxor 5 * 32(%rsi), %ymm2, %ymm2; | ||
1318 | vpxor 6 * 32(%rsi), %ymm1, %ymm1; | ||
1319 | vpxor 7 * 32(%rsi), %ymm0, %ymm0; | ||
1320 | vpxor 8 * 32(%rsi), %ymm15, %ymm15; | ||
1321 | vpxor 9 * 32(%rsi), %ymm14, %ymm14; | ||
1322 | vpxor 10 * 32(%rsi), %ymm13, %ymm13; | ||
1323 | vpxor 11 * 32(%rsi), %ymm12, %ymm12; | ||
1324 | vpxor 12 * 32(%rsi), %ymm11, %ymm11; | ||
1325 | vpxor 13 * 32(%rsi), %ymm10, %ymm10; | ||
1326 | vpxor 14 * 32(%rsi), %ymm9, %ymm9; | ||
1327 | vpxor 15 * 32(%rsi), %ymm8, %ymm8; | ||
1328 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1329 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1330 | %ymm8, %rsi); | ||
1331 | |||
1332 | vzeroupper; | ||
1333 | |||
1334 | ret; | ||
1335 | ENDPROC(camellia_xts_crypt_32way) | ||
1336 | |||
1337 | ENTRY(camellia_xts_enc_32way) | ||
1338 | /* input: | ||
1339 | * %rdi: ctx, CTX | ||
1340 | * %rsi: dst (32 blocks) | ||
1341 | * %rdx: src (32 blocks) | ||
1342 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1343 | */ | ||
1344 | |||
1345 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | ||
1346 | |||
1347 | leaq __camellia_enc_blk32, %r9; | ||
1348 | |||
1349 | jmp camellia_xts_crypt_32way; | ||
1350 | ENDPROC(camellia_xts_enc_32way) | ||
1351 | |||
1352 | ENTRY(camellia_xts_dec_32way) | ||
1353 | /* input: | ||
1354 | * %rdi: ctx, CTX | ||
1355 | * %rsi: dst (32 blocks) | ||
1356 | * %rdx: src (32 blocks) | ||
1357 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1358 | */ | ||
1359 | |||
1360 | cmpl $16, key_length(CTX); | ||
1361 | movl $32, %r8d; | ||
1362 | movl $24, %eax; | ||
1363 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | ||
1364 | |||
1365 | leaq __camellia_dec_blk32, %r9; | ||
1366 | |||
1367 | jmp camellia_xts_crypt_32way; | ||
1368 | ENDPROC(camellia_xts_dec_32way) | ||
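Both entry points only choose the whitening-key index and the block function before tail-jumping into the shared camellia_xts_crypt_32way body above. Encryption whitens with key index 0; decryption walks the key schedule from the other end, so the index depends on the key size (key_length is in bytes, so 16 means a 128-bit key). A sketch of the selection the decryption entry makes, for illustration only:

    /* Sketch of the index picked by camellia_xts_dec_32way above. */
    static inline int dec_whitening_index(unsigned int key_length)
    {
            /* 128-bit keys use the shorter (24-entry) schedule */
            return key_length == 16 ? 24 : 32;
    }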
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c new file mode 100644 index 000000000000..414fe5d7946b --- /dev/null +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c | |||
@@ -0,0 +1,586 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia | ||
3 | * | ||
4 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/lrw.h> | ||
20 | #include <crypto/xts.h> | ||
21 | #include <asm/xcr.h> | ||
22 | #include <asm/xsave.h> | ||
23 | #include <asm/crypto/camellia.h> | ||
24 | #include <asm/crypto/ablk_helper.h> | ||
25 | #include <asm/crypto/glue_helper.h> | ||
26 | |||
27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 | ||
28 | #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 | ||
29 | |||
30 | /* 32-way AVX2/AES-NI parallel cipher functions */ | ||
31 | asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, | ||
32 | const u8 *src); | ||
33 | asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
34 | const u8 *src); | ||
35 | |||
36 | asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
37 | const u8 *src); | ||
38 | asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, | ||
39 | const u8 *src, le128 *iv); | ||
40 | |||
41 | asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, le128 *iv); | ||
45 | |||
46 | static const struct common_glue_ctx camellia_enc = { | ||
47 | .num_funcs = 4, | ||
48 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
49 | |||
50 | .funcs = { { | ||
51 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
52 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } | ||
53 | }, { | ||
54 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
55 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } | ||
56 | }, { | ||
57 | .num_blocks = 2, | ||
58 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } | ||
59 | }, { | ||
60 | .num_blocks = 1, | ||
61 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } | ||
62 | } } | ||
63 | }; | ||
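The common_glue_ctx tables list implementations from widest to narrowest; the shared glue code consumes them by always using the widest entry whose block count still fits the remaining data, so a long request is handled mostly by the 32-way AVX2 code and only the tail drops down to the 16-way, 2-way and single-block routines. A simplified sketch of that selection loop (an assumption about how glue_ecb_crypt_128bit behaves, not its exact body):

    static void ecb_walk_sketch(const struct common_glue_ctx *gctx, void *ctx,
                                u8 *dst, const u8 *src, unsigned int nblocks)
    {
            unsigned int i;

            while (nblocks) {
                    for (i = 0; i < gctx->num_funcs; i++) {
                            unsigned int n = gctx->funcs[i].num_blocks;

                            if (nblocks < n)
                                    continue;       /* too few blocks left, try a narrower entry */

                            gctx->funcs[i].fn_u.ecb(ctx, dst, src);
                            src += n * CAMELLIA_BLOCK_SIZE;
                            dst += n * CAMELLIA_BLOCK_SIZE;
                            nblocks -= n;
                            break;
                    }
            }
    }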
64 | |||
65 | static const struct common_glue_ctx camellia_ctr = { | ||
66 | .num_funcs = 4, | ||
67 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
68 | |||
69 | .funcs = { { | ||
70 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
71 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } | ||
72 | }, { | ||
73 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
74 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } | ||
75 | }, { | ||
76 | .num_blocks = 2, | ||
77 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } | ||
78 | }, { | ||
79 | .num_blocks = 1, | ||
80 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } | ||
81 | } } | ||
82 | }; | ||
83 | |||
84 | static const struct common_glue_ctx camellia_enc_xts = { | ||
85 | .num_funcs = 3, | ||
86 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
87 | |||
88 | .funcs = { { | ||
89 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
90 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } | ||
91 | }, { | ||
92 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
93 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } | ||
94 | }, { | ||
95 | .num_blocks = 1, | ||
96 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } | ||
97 | } } | ||
98 | }; | ||
99 | |||
100 | static const struct common_glue_ctx camellia_dec = { | ||
101 | .num_funcs = 4, | ||
102 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
103 | |||
104 | .funcs = { { | ||
105 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
106 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) } | ||
107 | }, { | ||
108 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
109 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } | ||
110 | }, { | ||
111 | .num_blocks = 2, | ||
112 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } | ||
113 | }, { | ||
114 | .num_blocks = 1, | ||
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } | ||
116 | } } | ||
117 | }; | ||
118 | |||
119 | static const struct common_glue_ctx camellia_dec_cbc = { | ||
120 | .num_funcs = 4, | ||
121 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
122 | |||
123 | .funcs = { { | ||
124 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
125 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } | ||
126 | }, { | ||
127 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
128 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } | ||
129 | }, { | ||
130 | .num_blocks = 2, | ||
131 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } | ||
132 | }, { | ||
133 | .num_blocks = 1, | ||
134 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } | ||
135 | } } | ||
136 | }; | ||
137 | |||
138 | static const struct common_glue_ctx camellia_dec_xts = { | ||
139 | .num_funcs = 3, | ||
140 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
141 | |||
142 | .funcs = { { | ||
143 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
144 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } | ||
145 | }, { | ||
146 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
147 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } | ||
148 | }, { | ||
149 | .num_blocks = 1, | ||
150 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } | ||
151 | } } | ||
152 | }; | ||
153 | |||
154 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
155 | struct scatterlist *src, unsigned int nbytes) | ||
156 | { | ||
157 | return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); | ||
158 | } | ||
159 | |||
160 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); | ||
164 | } | ||
165 | |||
166 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, | ||
170 | dst, src, nbytes); | ||
171 | } | ||
172 | |||
173 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
174 | struct scatterlist *src, unsigned int nbytes) | ||
175 | { | ||
176 | return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, | ||
177 | nbytes); | ||
178 | } | ||
179 | |||
180 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
181 | struct scatterlist *src, unsigned int nbytes) | ||
182 | { | ||
183 | return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); | ||
184 | } | ||
185 | |||
186 | static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
187 | { | ||
188 | return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, | ||
189 | CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, | ||
190 | nbytes); | ||
191 | } | ||
192 | |||
193 | static inline void camellia_fpu_end(bool fpu_enabled) | ||
194 | { | ||
195 | glue_fpu_end(fpu_enabled); | ||
196 | } | ||
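camellia_fpu_begin()/camellia_fpu_end() wrap the kernel_fpu_begin()/kernel_fpu_end() bookkeeping that any kernel code touching the SSE/AVX register file must do. The glue helper only turns the FPU on once at least one full 16-block batch is pending, since saving and restoring the extended state costs more than encrypting a few blocks with the scalar code. A conceptual sketch of that policy (not the helper's actual body):

    static bool fpu_begin_sketch(bool fpu_enabled, unsigned int nbytes)
    {
            if (fpu_enabled)
                    return true;            /* already on for this request */

            if (nbytes < CAMELLIA_AESNI_PARALLEL_BLOCKS * CAMELLIA_BLOCK_SIZE)
                    return false;           /* short tail: stay on the scalar path */

            kernel_fpu_begin();
            return true;
    }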
197 | |||
198 | static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, | ||
199 | unsigned int key_len) | ||
200 | { | ||
201 | return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, | ||
202 | &tfm->crt_flags); | ||
203 | } | ||
204 | |||
205 | struct crypt_priv { | ||
206 | struct camellia_ctx *ctx; | ||
207 | bool fpu_enabled; | ||
208 | }; | ||
209 | |||
210 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
211 | { | ||
212 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
213 | struct crypt_priv *ctx = priv; | ||
214 | int i; | ||
215 | |||
216 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
217 | |||
218 | if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { | ||
219 | camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst); | ||
220 | srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
221 | nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
222 | } | ||
223 | |||
224 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
225 | camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
226 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
227 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
228 | } | ||
229 | |||
230 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
231 | camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); | ||
232 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
233 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
234 | } | ||
235 | |||
236 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
237 | camellia_enc_blk(ctx->ctx, srcdst, srcdst); | ||
238 | } | ||
239 | |||
240 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
241 | { | ||
242 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
243 | struct crypt_priv *ctx = priv; | ||
244 | int i; | ||
245 | |||
246 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
247 | |||
248 | if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { | ||
249 | camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst); | ||
250 | srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
251 | nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
252 | } | ||
253 | |||
254 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
255 | camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
256 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
257 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
258 | } | ||
259 | |||
260 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
261 | camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); | ||
262 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
263 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
264 | } | ||
265 | |||
266 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
267 | camellia_dec_blk(ctx->ctx, srcdst, srcdst); | ||
268 | } | ||
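Both LRW callbacks above process one bounce-buffer batch with the widest code that still fits: at most one 32-way call, then at most one 16-way call, then 2-way pairs, then single blocks. Because lrw_crypt() hands the callback at most CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS blocks at a time (the tbuf size below), plain if tests suffice for the two widest steps; for example a 27-block batch becomes one 16-way call, five 2-way calls and one single-block call. A compact mirror of that carving, for illustration only:

    /* Illustration: how one batch of n <= 32 blocks is carved up. */
    static void carve_batch_sketch(unsigned int n)
    {
            if (n >= 32)            /* one camellia_ecb_*_32way call */
                    n -= 32;
            if (n >= 16)            /* one camellia_ecb_*_16way call */
                    n -= 16;
            while (n >= 2)          /* camellia_*_blk_2way calls */
                    n -= 2;
            /* any remaining block is handled by camellia_*_blk() */
    }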
269 | |||
270 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
271 | struct scatterlist *src, unsigned int nbytes) | ||
272 | { | ||
273 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
274 | be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; | ||
275 | struct crypt_priv crypt_ctx = { | ||
276 | .ctx = &ctx->camellia_ctx, | ||
277 | .fpu_enabled = false, | ||
278 | }; | ||
279 | struct lrw_crypt_req req = { | ||
280 | .tbuf = buf, | ||
281 | .tbuflen = sizeof(buf), | ||
282 | |||
283 | .table_ctx = &ctx->lrw_table, | ||
284 | .crypt_ctx = &crypt_ctx, | ||
285 | .crypt_fn = encrypt_callback, | ||
286 | }; | ||
287 | int ret; | ||
288 | |||
289 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
290 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
291 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
292 | |||
293 | return ret; | ||
294 | } | ||
295 | |||
296 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
297 | struct scatterlist *src, unsigned int nbytes) | ||
298 | { | ||
299 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
300 | be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; | ||
301 | struct crypt_priv crypt_ctx = { | ||
302 | .ctx = &ctx->camellia_ctx, | ||
303 | .fpu_enabled = false, | ||
304 | }; | ||
305 | struct lrw_crypt_req req = { | ||
306 | .tbuf = buf, | ||
307 | .tbuflen = sizeof(buf), | ||
308 | |||
309 | .table_ctx = &ctx->lrw_table, | ||
310 | .crypt_ctx = &crypt_ctx, | ||
311 | .crypt_fn = decrypt_callback, | ||
312 | }; | ||
313 | int ret; | ||
314 | |||
315 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
316 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
317 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
318 | |||
319 | return ret; | ||
320 | } | ||
321 | |||
322 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
323 | struct scatterlist *src, unsigned int nbytes) | ||
324 | { | ||
325 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
326 | |||
327 | return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, | ||
328 | XTS_TWEAK_CAST(camellia_enc_blk), | ||
329 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
330 | } | ||
331 | |||
332 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
333 | struct scatterlist *src, unsigned int nbytes) | ||
334 | { | ||
335 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
336 | |||
337 | return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, | ||
338 | XTS_TWEAK_CAST(camellia_enc_blk), | ||
339 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
340 | } | ||
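Note that both xts_encrypt() and xts_decrypt() pass camellia_enc_blk as the tweak function: in XTS the tweak is always produced by encrypting the sector IV under the second key, and only the per-block data cipher changes direction. The per-block relation the assembly implements is C = E_K1(P ⊕ T) ⊕ T with T = E_K2(IV) · αʲ for block j; a one-block C sketch (illustration only, reusing the existing single-block helper):

    static void xts_one_block_sketch(struct camellia_ctx *data_key, u8 *dst,
                                     const u8 *src, const u8 tweak[16])
    {
            u8 buf[CAMELLIA_BLOCK_SIZE];
            int i;

            for (i = 0; i < CAMELLIA_BLOCK_SIZE; i++)
                    buf[i] = src[i] ^ tweak[i];             /* PP = P xor T */

            camellia_enc_blk(data_key, buf, buf);           /* CC = E_K1(PP) */

            for (i = 0; i < CAMELLIA_BLOCK_SIZE; i++)
                    dst[i] = buf[i] ^ tweak[i];             /* C = CC xor T */
    }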
341 | |||
342 | static struct crypto_alg cmll_algs[10] = { { | ||
343 | .cra_name = "__ecb-camellia-aesni-avx2", | ||
344 | .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", | ||
345 | .cra_priority = 0, | ||
346 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
347 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
348 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
349 | .cra_alignmask = 0, | ||
350 | .cra_type = &crypto_blkcipher_type, | ||
351 | .cra_module = THIS_MODULE, | ||
352 | .cra_u = { | ||
353 | .blkcipher = { | ||
354 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
355 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
356 | .setkey = camellia_setkey, | ||
357 | .encrypt = ecb_encrypt, | ||
358 | .decrypt = ecb_decrypt, | ||
359 | }, | ||
360 | }, | ||
361 | }, { | ||
362 | .cra_name = "__cbc-camellia-aesni-avx2", | ||
363 | .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", | ||
364 | .cra_priority = 0, | ||
365 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
366 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
367 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
368 | .cra_alignmask = 0, | ||
369 | .cra_type = &crypto_blkcipher_type, | ||
370 | .cra_module = THIS_MODULE, | ||
371 | .cra_u = { | ||
372 | .blkcipher = { | ||
373 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
374 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
375 | .setkey = camellia_setkey, | ||
376 | .encrypt = cbc_encrypt, | ||
377 | .decrypt = cbc_decrypt, | ||
378 | }, | ||
379 | }, | ||
380 | }, { | ||
381 | .cra_name = "__ctr-camellia-aesni-avx2", | ||
382 | .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", | ||
383 | .cra_priority = 0, | ||
384 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
385 | .cra_blocksize = 1, | ||
386 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
387 | .cra_alignmask = 0, | ||
388 | .cra_type = &crypto_blkcipher_type, | ||
389 | .cra_module = THIS_MODULE, | ||
390 | .cra_u = { | ||
391 | .blkcipher = { | ||
392 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
393 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
394 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
395 | .setkey = camellia_setkey, | ||
396 | .encrypt = ctr_crypt, | ||
397 | .decrypt = ctr_crypt, | ||
398 | }, | ||
399 | }, | ||
400 | }, { | ||
401 | .cra_name = "__lrw-camellia-aesni-avx2", | ||
402 | .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", | ||
403 | .cra_priority = 0, | ||
404 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
405 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
406 | .cra_ctxsize = sizeof(struct camellia_lrw_ctx), | ||
407 | .cra_alignmask = 0, | ||
408 | .cra_type = &crypto_blkcipher_type, | ||
409 | .cra_module = THIS_MODULE, | ||
410 | .cra_exit = lrw_camellia_exit_tfm, | ||
411 | .cra_u = { | ||
412 | .blkcipher = { | ||
413 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | ||
414 | CAMELLIA_BLOCK_SIZE, | ||
415 | .max_keysize = CAMELLIA_MAX_KEY_SIZE + | ||
416 | CAMELLIA_BLOCK_SIZE, | ||
417 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
418 | .setkey = lrw_camellia_setkey, | ||
419 | .encrypt = lrw_encrypt, | ||
420 | .decrypt = lrw_decrypt, | ||
421 | }, | ||
422 | }, | ||
423 | }, { | ||
424 | .cra_name = "__xts-camellia-aesni-avx2", | ||
425 | .cra_driver_name = "__driver-xts-camellia-aesni-avx2", | ||
426 | .cra_priority = 0, | ||
427 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
428 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
429 | .cra_ctxsize = sizeof(struct camellia_xts_ctx), | ||
430 | .cra_alignmask = 0, | ||
431 | .cra_type = &crypto_blkcipher_type, | ||
432 | .cra_module = THIS_MODULE, | ||
433 | .cra_u = { | ||
434 | .blkcipher = { | ||
435 | .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, | ||
436 | .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, | ||
437 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
438 | .setkey = xts_camellia_setkey, | ||
439 | .encrypt = xts_encrypt, | ||
440 | .decrypt = xts_decrypt, | ||
441 | }, | ||
442 | }, | ||
443 | }, { | ||
444 | .cra_name = "ecb(camellia)", | ||
445 | .cra_driver_name = "ecb-camellia-aesni-avx2", | ||
446 | .cra_priority = 500, | ||
447 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
448 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
449 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
450 | .cra_alignmask = 0, | ||
451 | .cra_type = &crypto_ablkcipher_type, | ||
452 | .cra_module = THIS_MODULE, | ||
453 | .cra_init = ablk_init, | ||
454 | .cra_exit = ablk_exit, | ||
455 | .cra_u = { | ||
456 | .ablkcipher = { | ||
457 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
458 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
459 | .setkey = ablk_set_key, | ||
460 | .encrypt = ablk_encrypt, | ||
461 | .decrypt = ablk_decrypt, | ||
462 | }, | ||
463 | }, | ||
464 | }, { | ||
465 | .cra_name = "cbc(camellia)", | ||
466 | .cra_driver_name = "cbc-camellia-aesni-avx2", | ||
467 | .cra_priority = 500, | ||
468 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
469 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
470 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
471 | .cra_alignmask = 0, | ||
472 | .cra_type = &crypto_ablkcipher_type, | ||
473 | .cra_module = THIS_MODULE, | ||
474 | .cra_init = ablk_init, | ||
475 | .cra_exit = ablk_exit, | ||
476 | .cra_u = { | ||
477 | .ablkcipher = { | ||
478 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
479 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
480 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
481 | .setkey = ablk_set_key, | ||
482 | .encrypt = __ablk_encrypt, | ||
483 | .decrypt = ablk_decrypt, | ||
484 | }, | ||
485 | }, | ||
486 | }, { | ||
487 | .cra_name = "ctr(camellia)", | ||
488 | .cra_driver_name = "ctr-camellia-aesni-avx2", | ||
489 | .cra_priority = 500, | ||
490 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
491 | .cra_blocksize = 1, | ||
492 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
493 | .cra_alignmask = 0, | ||
494 | .cra_type = &crypto_ablkcipher_type, | ||
495 | .cra_module = THIS_MODULE, | ||
496 | .cra_init = ablk_init, | ||
497 | .cra_exit = ablk_exit, | ||
498 | .cra_u = { | ||
499 | .ablkcipher = { | ||
500 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
501 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
502 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_encrypt, | ||
506 | .geniv = "chainiv", | ||
507 | }, | ||
508 | }, | ||
509 | }, { | ||
510 | .cra_name = "lrw(camellia)", | ||
511 | .cra_driver_name = "lrw-camellia-aesni-avx2", | ||
512 | .cra_priority = 500, | ||
513 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
514 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
515 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
516 | .cra_alignmask = 0, | ||
517 | .cra_type = &crypto_ablkcipher_type, | ||
518 | .cra_module = THIS_MODULE, | ||
519 | .cra_init = ablk_init, | ||
520 | .cra_exit = ablk_exit, | ||
521 | .cra_u = { | ||
522 | .ablkcipher = { | ||
523 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | ||
524 | CAMELLIA_BLOCK_SIZE, | ||
525 | .max_keysize = CAMELLIA_MAX_KEY_SIZE + | ||
526 | CAMELLIA_BLOCK_SIZE, | ||
527 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
528 | .setkey = ablk_set_key, | ||
529 | .encrypt = ablk_encrypt, | ||
530 | .decrypt = ablk_decrypt, | ||
531 | }, | ||
532 | }, | ||
533 | }, { | ||
534 | .cra_name = "xts(camellia)", | ||
535 | .cra_driver_name = "xts-camellia-aesni-avx2", | ||
536 | .cra_priority = 500, | ||
537 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
538 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
539 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
540 | .cra_alignmask = 0, | ||
541 | .cra_type = &crypto_ablkcipher_type, | ||
542 | .cra_module = THIS_MODULE, | ||
543 | .cra_init = ablk_init, | ||
544 | .cra_exit = ablk_exit, | ||
545 | .cra_u = { | ||
546 | .ablkcipher = { | ||
547 | .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, | ||
548 | .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, | ||
549 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
550 | .setkey = ablk_set_key, | ||
551 | .encrypt = ablk_encrypt, | ||
552 | .decrypt = ablk_decrypt, | ||
553 | }, | ||
554 | }, | ||
555 | } }; | ||
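The array registers the ciphers in two tiers: the "__driver-*"/"__ecb-*" entries (priority 0) are internal synchronous blkciphers that do the real work, while the plain "ecb(camellia)", "cbc(camellia)", etc. entries (priority 500) are ablk_helper wrappers that defer to cryptd when a request arrives in a context where the FPU cannot be used. The 500 priority outranks the plain-AVX and generic implementations, so callers pick this code up automatically; a usage sketch (illustration only, not part of this patch):

    static int example_get_xts_camellia(void)
    {
            struct crypto_ablkcipher *tfm;

            /* resolves to the highest-priority "xts(camellia)" provider */
            tfm = crypto_alloc_ablkcipher("xts(camellia)", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            /* ... set key, issue requests ... */

            crypto_free_ablkcipher(tfm);
            return 0;
    }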
556 | |||
557 | static int __init camellia_aesni_init(void) | ||
558 | { | ||
559 | u64 xcr0; | ||
560 | |||
561 | if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { | ||
562 | pr_info("AVX2 or AES-NI instructions are not detected.\n"); | ||
563 | return -ENODEV; | ||
564 | } | ||
565 | |||
566 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
567 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
568 | pr_info("AVX2 detected but unusable.\n"); | ||
569 | return -ENODEV; | ||
570 | } | ||
571 | |||
572 | return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
573 | } | ||
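Checking the CPUID feature flags alone is not enough for AVX2: the OS must also have enabled the SSE and YMM state components in XCR0, otherwise VEX-encoded 256-bit instructions fault even on capable hardware, which is what the xgetbv() test above guards against. The bits in question, shown as a small illustrative check:

    /* Illustration of the mask tested above: both state components must be
     * OS-enabled before YMM registers may be touched. */
    static bool ymm_state_enabled(u64 xcr0)
    {
            const u64 need = XSTATE_SSE | XSTATE_YMM;       /* bit 1 | bit 2 = 0x6 */

            return (xcr0 & need) == need;
    }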
574 | |||
575 | static void __exit camellia_aesni_fini(void) | ||
576 | { | ||
577 | crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
578 | } | ||
579 | |||
580 | module_init(camellia_aesni_init); | ||
581 | module_exit(camellia_aesni_fini); | ||
582 | |||
583 | MODULE_LICENSE("GPL"); | ||
584 | MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); | ||
585 | MODULE_ALIAS("camellia"); | ||
586 | MODULE_ALIAS("camellia-asm"); | ||
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 96cbb6068fce..37fd0c0a81ea 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia | 2 | * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia |
3 | * | 3 | * |
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -26,16 +26,44 @@ | |||
26 | 26 | ||
27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 | 27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 |
28 | 28 | ||
29 | /* 16-way AES-NI parallel cipher functions */ | 29 | /* 16-way parallel cipher functions (avx/aes-ni) */ |
30 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, | 30 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, |
31 | const u8 *src); | 31 | const u8 *src); |
32 | EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); | ||
33 | |||
32 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, | 34 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, |
33 | const u8 *src); | 35 | const u8 *src); |
36 | EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); | ||
34 | 37 | ||
35 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | 38 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, |
36 | const u8 *src); | 39 | const u8 *src); |
40 | EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); | ||
41 | |||
37 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | 42 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, |
38 | const u8 *src, le128 *iv); | 43 | const u8 *src, le128 *iv); |
44 | EXPORT_SYMBOL_GPL(camellia_ctr_16way); | ||
45 | |||
46 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
47 | const u8 *src, le128 *iv); | ||
48 | EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); | ||
49 | |||
50 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
51 | const u8 *src, le128 *iv); | ||
52 | EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); | ||
53 | |||
54 | void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
55 | { | ||
56 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
57 | GLUE_FUNC_CAST(camellia_enc_blk)); | ||
58 | } | ||
59 | EXPORT_SYMBOL_GPL(camellia_xts_enc); | ||
60 | |||
61 | void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
62 | { | ||
63 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
64 | GLUE_FUNC_CAST(camellia_dec_blk)); | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(camellia_xts_dec); | ||
39 | 67 | ||
40 | static const struct common_glue_ctx camellia_enc = { | 68 | static const struct common_glue_ctx camellia_enc = { |
41 | .num_funcs = 3, | 69 | .num_funcs = 3, |
@@ -69,6 +97,19 @@ static const struct common_glue_ctx camellia_ctr = { | |||
69 | } } | 97 | } } |
70 | }; | 98 | }; |
71 | 99 | ||
100 | static const struct common_glue_ctx camellia_enc_xts = { | ||
101 | .num_funcs = 2, | ||
102 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
103 | |||
104 | .funcs = { { | ||
105 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
106 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } | ||
107 | }, { | ||
108 | .num_blocks = 1, | ||
109 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } | ||
110 | } } | ||
111 | }; | ||
112 | |||
72 | static const struct common_glue_ctx camellia_dec = { | 113 | static const struct common_glue_ctx camellia_dec = { |
73 | .num_funcs = 3, | 114 | .num_funcs = 3, |
74 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | 115 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, |
@@ -101,6 +142,19 @@ static const struct common_glue_ctx camellia_dec_cbc = { | |||
101 | } } | 142 | } } |
102 | }; | 143 | }; |
103 | 144 | ||
145 | static const struct common_glue_ctx camellia_dec_xts = { | ||
146 | .num_funcs = 2, | ||
147 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
148 | |||
149 | .funcs = { { | ||
150 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
151 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } | ||
152 | }, { | ||
153 | .num_blocks = 1, | ||
154 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } | ||
155 | } } | ||
156 | }; | ||
157 | |||
104 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 158 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
105 | struct scatterlist *src, unsigned int nbytes) | 159 | struct scatterlist *src, unsigned int nbytes) |
106 | { | 160 | { |
@@ -261,54 +315,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
261 | struct scatterlist *src, unsigned int nbytes) | 315 | struct scatterlist *src, unsigned int nbytes) |
262 | { | 316 | { |
263 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 317 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
264 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
265 | struct crypt_priv crypt_ctx = { | ||
266 | .ctx = &ctx->crypt_ctx, | ||
267 | .fpu_enabled = false, | ||
268 | }; | ||
269 | struct xts_crypt_req req = { | ||
270 | .tbuf = buf, | ||
271 | .tbuflen = sizeof(buf), | ||
272 | 318 | ||
273 | .tweak_ctx = &ctx->tweak_ctx, | 319 | return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, |
274 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | 320 | XTS_TWEAK_CAST(camellia_enc_blk), |
275 | .crypt_ctx = &crypt_ctx, | 321 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
276 | .crypt_fn = encrypt_callback, | ||
277 | }; | ||
278 | int ret; | ||
279 | |||
280 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
281 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
282 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
283 | |||
284 | return ret; | ||
285 | } | 322 | } |
286 | 323 | ||
287 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 324 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
288 | struct scatterlist *src, unsigned int nbytes) | 325 | struct scatterlist *src, unsigned int nbytes) |
289 | { | 326 | { |
290 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 327 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
291 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
292 | struct crypt_priv crypt_ctx = { | ||
293 | .ctx = &ctx->crypt_ctx, | ||
294 | .fpu_enabled = false, | ||
295 | }; | ||
296 | struct xts_crypt_req req = { | ||
297 | .tbuf = buf, | ||
298 | .tbuflen = sizeof(buf), | ||
299 | |||
300 | .tweak_ctx = &ctx->tweak_ctx, | ||
301 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | ||
302 | .crypt_ctx = &crypt_ctx, | ||
303 | .crypt_fn = decrypt_callback, | ||
304 | }; | ||
305 | int ret; | ||
306 | 328 | ||
307 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 329 | return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, |
308 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 330 | XTS_TWEAK_CAST(camellia_enc_blk), |
309 | camellia_fpu_end(crypt_ctx.fpu_enabled); | 331 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
310 | |||
311 | return ret; | ||
312 | } | 332 | } |
313 | 333 | ||
314 | static struct crypto_alg cmll_algs[10] = { { | 334 | static struct crypto_alg cmll_algs[10] = { { |
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index f93b6105a0ce..e3531f833951 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 7 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -227,6 +227,8 @@ | |||
227 | .data | 227 | .data |
228 | 228 | ||
229 | .align 16 | 229 | .align 16 |
230 | .Lxts_gf128mul_and_shl1_mask: | ||
231 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
230 | .Lbswap_mask: | 232 | .Lbswap_mask: |
231 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 | 233 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 |
232 | .Lbswap128_mask: | 234 | .Lbswap128_mask: |
@@ -424,3 +426,47 @@ ENTRY(cast6_ctr_8way) | |||
424 | 426 | ||
425 | ret; | 427 | ret; |
426 | ENDPROC(cast6_ctr_8way) | 428 | ENDPROC(cast6_ctr_8way) |
429 | |||
430 | ENTRY(cast6_xts_enc_8way) | ||
431 | /* input: | ||
432 | * %rdi: ctx, CTX | ||
433 | * %rsi: dst | ||
434 | * %rdx: src | ||
435 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
436 | */ | ||
437 | |||
438 | movq %rsi, %r11; | ||
439 | |||
440 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
441 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
442 | RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); | ||
443 | |||
444 | call __cast6_enc_blk8; | ||
445 | |||
446 | /* dst <= regs xor IVs(in dst) */ | ||
447 | store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
448 | |||
449 | ret; | ||
450 | ENDPROC(cast6_xts_enc_8way) | ||
451 | |||
452 | ENTRY(cast6_xts_dec_8way) | ||
453 | /* input: | ||
454 | * %rdi: ctx, CTX | ||
455 | * %rsi: dst | ||
456 | * %rdx: src | ||
457 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
458 | */ | ||
459 | |||
460 | movq %rsi, %r11; | ||
461 | |||
462 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
463 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
464 | RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); | ||
465 | |||
466 | call __cast6_dec_blk8; | ||
467 | |||
468 | /* dst <= regs xor IVs(in dst) */ | ||
469 | store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
470 | |||
471 | ret; | ||
472 | ENDPROC(cast6_xts_dec_8way) | ||
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 92f7ca24790a..8d0dfb86a559 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c | |||
@@ -4,6 +4,8 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
8 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
9 | * the Free Software Foundation; either version 2 of the License, or | 11 | * the Free Software Foundation; either version 2 of the License, or |
@@ -50,6 +52,23 @@ asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, | |||
50 | asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, | 52 | asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, |
51 | le128 *iv); | 53 | le128 *iv); |
52 | 54 | ||
55 | asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst, | ||
56 | const u8 *src, le128 *iv); | ||
57 | asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst, | ||
58 | const u8 *src, le128 *iv); | ||
59 | |||
60 | static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
61 | { | ||
62 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
63 | GLUE_FUNC_CAST(__cast6_encrypt)); | ||
64 | } | ||
65 | |||
66 | static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
67 | { | ||
68 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
69 | GLUE_FUNC_CAST(__cast6_decrypt)); | ||
70 | } | ||
71 | |||
53 | static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 72 | static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
54 | { | 73 | { |
55 | be128 ctrblk; | 74 | be128 ctrblk; |
@@ -87,6 +106,19 @@ static const struct common_glue_ctx cast6_ctr = { | |||
87 | } } | 106 | } } |
88 | }; | 107 | }; |
89 | 108 | ||
109 | static const struct common_glue_ctx cast6_enc_xts = { | ||
110 | .num_funcs = 2, | ||
111 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, | ||
112 | |||
113 | .funcs = { { | ||
114 | .num_blocks = CAST6_PARALLEL_BLOCKS, | ||
115 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) } | ||
116 | }, { | ||
117 | .num_blocks = 1, | ||
118 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) } | ||
119 | } } | ||
120 | }; | ||
121 | |||
90 | static const struct common_glue_ctx cast6_dec = { | 122 | static const struct common_glue_ctx cast6_dec = { |
91 | .num_funcs = 2, | 123 | .num_funcs = 2, |
92 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, | 124 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, |
@@ -113,6 +145,19 @@ static const struct common_glue_ctx cast6_dec_cbc = { | |||
113 | } } | 145 | } } |
114 | }; | 146 | }; |
115 | 147 | ||
148 | static const struct common_glue_ctx cast6_dec_xts = { | ||
149 | .num_funcs = 2, | ||
150 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, | ||
151 | |||
152 | .funcs = { { | ||
153 | .num_blocks = CAST6_PARALLEL_BLOCKS, | ||
154 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) } | ||
155 | }, { | ||
156 | .num_blocks = 1, | ||
157 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) } | ||
158 | } } | ||
159 | }; | ||
160 | |||
116 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 161 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
117 | struct scatterlist *src, unsigned int nbytes) | 162 | struct scatterlist *src, unsigned int nbytes) |
118 | { | 163 | { |
@@ -307,54 +352,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
307 | struct scatterlist *src, unsigned int nbytes) | 352 | struct scatterlist *src, unsigned int nbytes) |
308 | { | 353 | { |
309 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 354 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
310 | be128 buf[CAST6_PARALLEL_BLOCKS]; | ||
311 | struct crypt_priv crypt_ctx = { | ||
312 | .ctx = &ctx->crypt_ctx, | ||
313 | .fpu_enabled = false, | ||
314 | }; | ||
315 | struct xts_crypt_req req = { | ||
316 | .tbuf = buf, | ||
317 | .tbuflen = sizeof(buf), | ||
318 | 355 | ||
319 | .tweak_ctx = &ctx->tweak_ctx, | 356 | return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes, |
320 | .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), | 357 | XTS_TWEAK_CAST(__cast6_encrypt), |
321 | .crypt_ctx = &crypt_ctx, | 358 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
322 | .crypt_fn = encrypt_callback, | ||
323 | }; | ||
324 | int ret; | ||
325 | |||
326 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
327 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
328 | cast6_fpu_end(crypt_ctx.fpu_enabled); | ||
329 | |||
330 | return ret; | ||
331 | } | 359 | } |
332 | 360 | ||
333 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 361 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
334 | struct scatterlist *src, unsigned int nbytes) | 362 | struct scatterlist *src, unsigned int nbytes) |
335 | { | 363 | { |
336 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 364 | struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
337 | be128 buf[CAST6_PARALLEL_BLOCKS]; | ||
338 | struct crypt_priv crypt_ctx = { | ||
339 | .ctx = &ctx->crypt_ctx, | ||
340 | .fpu_enabled = false, | ||
341 | }; | ||
342 | struct xts_crypt_req req = { | ||
343 | .tbuf = buf, | ||
344 | .tbuflen = sizeof(buf), | ||
345 | |||
346 | .tweak_ctx = &ctx->tweak_ctx, | ||
347 | .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), | ||
348 | .crypt_ctx = &crypt_ctx, | ||
349 | .crypt_fn = decrypt_callback, | ||
350 | }; | ||
351 | int ret; | ||
352 | 365 | ||
353 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 366 | return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes, |
354 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 367 | XTS_TWEAK_CAST(__cast6_encrypt), |
355 | cast6_fpu_end(crypt_ctx.fpu_enabled); | 368 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
356 | |||
357 | return ret; | ||
358 | } | 369 | } |
359 | 370 | ||
360 | static struct crypto_alg cast6_algs[10] = { { | 371 | static struct crypto_alg cast6_algs[10] = { { |
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index c8335014a044..94c27df8a549 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S | |||
@@ -101,9 +101,8 @@ | |||
101 | * uint crc32_pclmul_le_16(unsigned char const *buffer, | 101 | * uint crc32_pclmul_le_16(unsigned char const *buffer, |
102 | * size_t len, uint crc32) | 102 | * size_t len, uint crc32) |
103 | */ | 103 | */ |
104 | .globl crc32_pclmul_le_16 | 104 | |
105 | .align 4, 0x90 | 105 | ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ |
106 | crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */ | ||
107 | movdqa (BUF), %xmm1 | 106 | movdqa (BUF), %xmm1 |
108 | movdqa 0x10(BUF), %xmm2 | 107 | movdqa 0x10(BUF), %xmm2 |
109 | movdqa 0x20(BUF), %xmm3 | 108 | movdqa 0x20(BUF), %xmm3 |
@@ -244,3 +243,4 @@ fold_64: | |||
244 | pextrd $0x01, %xmm1, %eax | 243 | pextrd $0x01, %xmm1, %eax |
245 | 244 | ||
246 | ret | 245 | ret |
246 | ENDPROC(crc32_pclmul_le_16) | ||
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index cf1a7ec4cc3a..dbc4339b5417 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S | |||
@@ -1,9 +1,10 @@ | |||
1 | /* | 1 | /* |
2 | * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) | 2 | * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) |
3 | * | 3 | * |
4 | * The white paper on CRC32C calculations with PCLMULQDQ instruction can be | 4 | * The white papers on CRC32C calculations with PCLMULQDQ instruction can be |
5 | * downloaded from: | 5 | * downloaded from: |
6 | * http://download.intel.com/design/intarch/papers/323405.pdf | 6 | * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf |
7 | * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf | ||
7 | * | 8 | * |
8 | * Copyright (C) 2012 Intel Corporation. | 9 | * Copyright (C) 2012 Intel Corporation. |
9 | * | 10 | * |
@@ -42,6 +43,7 @@ | |||
42 | * SOFTWARE. | 43 | * SOFTWARE. |
43 | */ | 44 | */ |
44 | 45 | ||
46 | #include <asm/inst.h> | ||
45 | #include <linux/linkage.h> | 47 | #include <linux/linkage.h> |
46 | 48 | ||
47 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction | 49 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction |
@@ -225,10 +227,10 @@ LABEL crc_ %i | |||
225 | movdqa (bufp), %xmm0 # 2 consts: K1:K2 | 227 | movdqa (bufp), %xmm0 # 2 consts: K1:K2 |
226 | 228 | ||
227 | movq crc_init, %xmm1 # CRC for block 1 | 229 | movq crc_init, %xmm1 # CRC for block 1 |
228 | pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 | 230 | PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 |
229 | 231 | ||
230 | movq crc1, %xmm2 # CRC for block 2 | 232 | movq crc1, %xmm2 # CRC for block 2 |
231 | pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 | 233 | PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 |
232 | 234 | ||
233 | pxor %xmm2,%xmm1 | 235 | pxor %xmm2,%xmm1 |
234 | movq %xmm1, %rax | 236 | movq %xmm1, %rax |
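The recombination step above folds the partial CRCs of the separately processed blocks together by multiplying them with the precomputed constants K1/K2 in GF(2); switching to the PCLMULQDQ macro from asm/inst.h lets the file assemble with binutils versions that do not yet know the mnemonic. A scalar C sketch of the carry-less 64x64-bit multiply that the instruction performs (illustration only):

    #include <stdint.h>

    static void clmul64_sketch(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
    {
            uint64_t rl = 0, rh = 0;
            int i;

            for (i = 0; i < 64; i++) {
                    if (b & ((uint64_t)1 << i)) {
                            rl ^= a << i;                   /* low 64 bits of a * x^i */
                            rh ^= i ? a >> (64 - i) : 0;    /* high 64 bits of a * x^i */
                    }
            }
            *lo = rl;
            *hi = rh;
    }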
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S index f7b6ea2ddfdb..02ee2308fb38 100644 --- a/arch/x86/crypto/glue_helper-asm-avx.S +++ b/arch/x86/crypto/glue_helper-asm-avx.S | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Shared glue code for 128bit block ciphers, AVX assembler macros | 2 | * Shared glue code for 128bit block ciphers, AVX assembler macros |
3 | * | 3 | * |
4 | * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -89,3 +89,62 @@ | |||
89 | vpxor (6*16)(src), x6, x6; \ | 89 | vpxor (6*16)(src), x6, x6; \ |
90 | vpxor (7*16)(src), x7, x7; \ | 90 | vpxor (7*16)(src), x7, x7; \ |
91 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | 91 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); |
92 | |||
93 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
94 | vpsrad $31, iv, tmp; \ | ||
95 | vpaddq iv, iv, iv; \ | ||
96 | vpshufd $0x13, tmp, tmp; \ | ||
97 | vpand mask, tmp, tmp; \ | ||
98 | vpxor tmp, iv, iv; | ||
99 | |||
100 | #define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ | ||
101 | t1, xts_gf128mul_and_shl1_mask) \ | ||
102 | vmovdqa xts_gf128mul_and_shl1_mask, t0; \ | ||
103 | \ | ||
104 | /* load IV */ \ | ||
105 | vmovdqu (iv), tiv; \ | ||
106 | vpxor (0*16)(src), tiv, x0; \ | ||
107 | vmovdqu tiv, (0*16)(dst); \ | ||
108 | \ | ||
109 | /* construct and store IVs, also xor with source */ \ | ||
110 | gf128mul_x_ble(tiv, t0, t1); \ | ||
111 | vpxor (1*16)(src), tiv, x1; \ | ||
112 | vmovdqu tiv, (1*16)(dst); \ | ||
113 | \ | ||
114 | gf128mul_x_ble(tiv, t0, t1); \ | ||
115 | vpxor (2*16)(src), tiv, x2; \ | ||
116 | vmovdqu tiv, (2*16)(dst); \ | ||
117 | \ | ||
118 | gf128mul_x_ble(tiv, t0, t1); \ | ||
119 | vpxor (3*16)(src), tiv, x3; \ | ||
120 | vmovdqu tiv, (3*16)(dst); \ | ||
121 | \ | ||
122 | gf128mul_x_ble(tiv, t0, t1); \ | ||
123 | vpxor (4*16)(src), tiv, x4; \ | ||
124 | vmovdqu tiv, (4*16)(dst); \ | ||
125 | \ | ||
126 | gf128mul_x_ble(tiv, t0, t1); \ | ||
127 | vpxor (5*16)(src), tiv, x5; \ | ||
128 | vmovdqu tiv, (5*16)(dst); \ | ||
129 | \ | ||
130 | gf128mul_x_ble(tiv, t0, t1); \ | ||
131 | vpxor (6*16)(src), tiv, x6; \ | ||
132 | vmovdqu tiv, (6*16)(dst); \ | ||
133 | \ | ||
134 | gf128mul_x_ble(tiv, t0, t1); \ | ||
135 | vpxor (7*16)(src), tiv, x7; \ | ||
136 | vmovdqu tiv, (7*16)(dst); \ | ||
137 | \ | ||
138 | gf128mul_x_ble(tiv, t0, t1); \ | ||
139 | vmovdqu tiv, (iv); | ||
140 | |||
141 | #define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
142 | vpxor (0*16)(dst), x0, x0; \ | ||
143 | vpxor (1*16)(dst), x1, x1; \ | ||
144 | vpxor (2*16)(dst), x2, x2; \ | ||
145 | vpxor (3*16)(dst), x3, x3; \ | ||
146 | vpxor (4*16)(dst), x4, x4; \ | ||
147 | vpxor (5*16)(dst), x5, x5; \ | ||
148 | vpxor (6*16)(dst), x6, x6; \ | ||
149 | vpxor (7*16)(dst), x7, x7; \ | ||
150 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
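load_xts_8way computes the eight consecutive tweaks on the fly, xors them into the source blocks, and parks each tweak in the destination buffer; store_xts_8way then xors the cipher output against those parked tweaks. That avoids dedicating eight extra registers or stack slots to the tweaks while the 8-block cipher runs. The same pattern in scalar C (a sketch only; kernel u8/memcpy assumed, tweaks taken as precomputed input):

    static void xts_8way_pattern(void (*crypt8)(void *ctx, u8 *dst, const u8 *src),
                                 void *ctx, u8 dst[8][16], const u8 src[8][16],
                                 const u8 tweak[8][16])
    {
            u8 buf[8][16];
            int i, j;

            for (i = 0; i < 8; i++) {
                    for (j = 0; j < 16; j++)
                            buf[i][j] = src[i][j] ^ tweak[i][j];    /* P xor T */
                    memcpy(dst[i], tweak[i], 16);                   /* park T_i in dst */
            }

            crypt8(ctx, (u8 *)buf, (const u8 *)buf);                /* 8 blocks in parallel */

            for (i = 0; i < 8; i++)
                    for (j = 0; j < 16; j++)
                            dst[i][j] ^= buf[i][j];                 /* out = E(P^T) ^ T */
    }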
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S new file mode 100644 index 000000000000..a53ac11dd385 --- /dev/null +++ b/arch/x86/crypto/glue_helper-asm-avx2.S | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | * Shared glue code for 128bit block ciphers, AVX2 assembler macros | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
14 | vmovdqu (0*32)(src), x0; \ | ||
15 | vmovdqu (1*32)(src), x1; \ | ||
16 | vmovdqu (2*32)(src), x2; \ | ||
17 | vmovdqu (3*32)(src), x3; \ | ||
18 | vmovdqu (4*32)(src), x4; \ | ||
19 | vmovdqu (5*32)(src), x5; \ | ||
20 | vmovdqu (6*32)(src), x6; \ | ||
21 | vmovdqu (7*32)(src), x7; | ||
22 | |||
23 | #define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
24 | vmovdqu x0, (0*32)(dst); \ | ||
25 | vmovdqu x1, (1*32)(dst); \ | ||
26 | vmovdqu x2, (2*32)(dst); \ | ||
27 | vmovdqu x3, (3*32)(dst); \ | ||
28 | vmovdqu x4, (4*32)(dst); \ | ||
29 | vmovdqu x5, (5*32)(dst); \ | ||
30 | vmovdqu x6, (6*32)(dst); \ | ||
31 | vmovdqu x7, (7*32)(dst); | ||
32 | |||
33 | #define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \ | ||
34 | vpxor t0, t0, t0; \ | ||
35 | vinserti128 $1, (src), t0, t0; \ | ||
36 | vpxor t0, x0, x0; \ | ||
37 | vpxor (0*32+16)(src), x1, x1; \ | ||
38 | vpxor (1*32+16)(src), x2, x2; \ | ||
39 | vpxor (2*32+16)(src), x3, x3; \ | ||
40 | vpxor (3*32+16)(src), x4, x4; \ | ||
41 | vpxor (4*32+16)(src), x5, x5; \ | ||
42 | vpxor (5*32+16)(src), x6, x6; \ | ||
43 | vpxor (6*32+16)(src), x7, x7; \ | ||
44 | store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
45 | |||
46 | #define inc_le128(x, minus_one, tmp) \ | ||
47 | vpcmpeqq minus_one, x, tmp; \ | ||
48 | vpsubq minus_one, x, x; \ | ||
49 | vpslldq $8, tmp, tmp; \ | ||
50 | vpsubq tmp, x, x; | ||
51 | |||
52 | #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ | ||
53 | vpcmpeqq minus_one, x, tmp1; \ | ||
54 | vpcmpeqq minus_two, x, tmp2; \ | ||
55 | vpsubq minus_two, x, x; \ | ||
56 | vpor tmp2, tmp1, tmp1; \ | ||
57 | vpslldq $8, tmp1, tmp1; \ | ||
58 | vpsubq tmp1, x, x; | ||
59 | |||
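inc_le128 and add2_le128 build a 128-bit counter increment out of 64-bit lane operations: AVX2 has no 128-bit add, so the carry out of the low qword is detected by comparing it against -1 (or against -1/-2 for the +2 case) before the addition and then folded into the high qword. A scalar equivalent, assuming the lo/hi layout matches the lanes (illustration only):

    #include <stdint.h>

    struct ctr128 { uint64_t lo, hi; };

    static void inc_le128_sketch(struct ctr128 *x)
    {
            uint64_t carry = (x->lo == UINT64_MAX);         /* low qword will wrap on +1 */

            x->lo += 1;
            x->hi += carry;
    }

    static void add2_le128_sketch(struct ctr128 *x)
    {
            uint64_t carry = (x->lo >= UINT64_MAX - 1);     /* low qword will wrap on +2 */

            x->lo += 2;
            x->hi += carry;
    }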
60 | #define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \ | ||
61 | t1x, t2, t2x, t3, t3x, t4, t5) \ | ||
62 | vpcmpeqd t0, t0, t0; \ | ||
63 | vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \ | ||
64 | vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\ | ||
65 | \ | ||
66 | /* load IV and byteswap */ \ | ||
67 | vmovdqu (iv), t2x; \ | ||
68 | vmovdqa t2x, t3x; \ | ||
69 | inc_le128(t2x, t0x, t1x); \ | ||
70 | vbroadcasti128 bswap, t1; \ | ||
71 | vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \ | ||
72 | vpshufb t1, t2, x0; \ | ||
73 | \ | ||
74 | /* construct IVs */ \ | ||
75 | add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \ | ||
76 | vpshufb t1, t2, x1; \ | ||
77 | add2_le128(t2, t0, t4, t3, t5); \ | ||
78 | vpshufb t1, t2, x2; \ | ||
79 | add2_le128(t2, t0, t4, t3, t5); \ | ||
80 | vpshufb t1, t2, x3; \ | ||
81 | add2_le128(t2, t0, t4, t3, t5); \ | ||
82 | vpshufb t1, t2, x4; \ | ||
83 | add2_le128(t2, t0, t4, t3, t5); \ | ||
84 | vpshufb t1, t2, x5; \ | ||
85 | add2_le128(t2, t0, t4, t3, t5); \ | ||
86 | vpshufb t1, t2, x6; \ | ||
87 | add2_le128(t2, t0, t4, t3, t5); \ | ||
88 | vpshufb t1, t2, x7; \ | ||
89 | vextracti128 $1, t2, t2x; \ | ||
90 | inc_le128(t2x, t0x, t3x); \ | ||
91 | vmovdqu t2x, (iv); | ||
92 | |||
93 | #define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
94 | vpxor (0*32)(src), x0, x0; \ | ||
95 | vpxor (1*32)(src), x1, x1; \ | ||
96 | vpxor (2*32)(src), x2, x2; \ | ||
97 | vpxor (3*32)(src), x3, x3; \ | ||
98 | vpxor (4*32)(src), x4, x4; \ | ||
99 | vpxor (5*32)(src), x5, x5; \ | ||
100 | vpxor (6*32)(src), x6, x6; \ | ||
101 | vpxor (7*32)(src), x7, x7; \ | ||
102 | store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
103 | |||
104 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
105 | vpsrad $31, iv, tmp; \ | ||
106 | vpaddq iv, iv, iv; \ | ||
107 | vpshufd $0x13, tmp, tmp; \ | ||
108 | vpand mask, tmp, tmp; \ | ||
109 | vpxor tmp, iv, iv; | ||
110 | |||
111 | #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ | ||
112 | vpsrad $31, iv, tmp0; \ | ||
113 | vpaddq iv, iv, tmp1; \ | ||
114 | vpsllq $2, iv, iv; \ | ||
115 | vpshufd $0x13, tmp0, tmp0; \ | ||
116 | vpsrad $31, tmp1, tmp1; \ | ||
117 | vpand mask2, tmp0, tmp0; \ | ||
118 | vpshufd $0x13, tmp1, tmp1; \ | ||
119 | vpxor tmp0, iv, iv; \ | ||
120 | vpand mask1, tmp1, tmp1; \ | ||
121 | vpxor tmp1, iv, iv; | ||
122 | |||
123 | #define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \ | ||
124 | tivx, t0, t0x, t1, t1x, t2, t2x, t3, \ | ||
125 | xts_gf128mul_and_shl1_mask_0, \ | ||
126 | xts_gf128mul_and_shl1_mask_1) \ | ||
127 | vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \ | ||
128 | \ | ||
129 | /* load IV and construct second IV */ \ | ||
130 | vmovdqu (iv), tivx; \ | ||
131 | vmovdqa tivx, t0x; \ | ||
132 | gf128mul_x_ble(tivx, t1x, t2x); \ | ||
133 | vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \ | ||
134 | vinserti128 $1, tivx, t0, tiv; \ | ||
135 | vpxor (0*32)(src), tiv, x0; \ | ||
136 | vmovdqu tiv, (0*32)(dst); \ | ||
137 | \ | ||
138 | /* construct and store IVs, also xor with source */ \ | ||
139 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
140 | vpxor (1*32)(src), tiv, x1; \ | ||
141 | vmovdqu tiv, (1*32)(dst); \ | ||
142 | \ | ||
143 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
144 | vpxor (2*32)(src), tiv, x2; \ | ||
145 | vmovdqu tiv, (2*32)(dst); \ | ||
146 | \ | ||
147 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
148 | vpxor (3*32)(src), tiv, x3; \ | ||
149 | vmovdqu tiv, (3*32)(dst); \ | ||
150 | \ | ||
151 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
152 | vpxor (4*32)(src), tiv, x4; \ | ||
153 | vmovdqu tiv, (4*32)(dst); \ | ||
154 | \ | ||
155 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
156 | vpxor (5*32)(src), tiv, x5; \ | ||
157 | vmovdqu tiv, (5*32)(dst); \ | ||
158 | \ | ||
159 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
160 | vpxor (6*32)(src), tiv, x6; \ | ||
161 | vmovdqu tiv, (6*32)(dst); \ | ||
162 | \ | ||
163 | gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ | ||
164 | vpxor (7*32)(src), tiv, x7; \ | ||
165 | vmovdqu tiv, (7*32)(dst); \ | ||
166 | \ | ||
167 | vextracti128 $1, tiv, tivx; \ | ||
168 | gf128mul_x_ble(tivx, t1x, t2x); \ | ||
169 | vmovdqu tivx, (iv); | ||
170 | |||
171 | #define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
172 | vpxor (0*32)(dst), x0, x0; \ | ||
173 | vpxor (1*32)(dst), x1, x1; \ | ||
174 | vpxor (2*32)(dst), x2, x2; \ | ||
175 | vpxor (3*32)(dst), x3, x3; \ | ||
176 | vpxor (4*32)(dst), x4, x4; \ | ||
177 | vpxor (5*32)(dst), x5, x5; \ | ||
178 | vpxor (6*32)(dst), x6, x6; \ | ||
179 | vpxor (7*32)(dst), x7, x7; \ | ||
180 | store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 22ce4f683e55..432f1d76ceb8 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Shared glue code for 128bit block ciphers | 2 | * Shared glue code for 128bit block ciphers |
3 | * | 3 | * |
4 | * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
5 | * | 5 | * |
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | 6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: |
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | 7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> |
@@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |||
304 | } | 304 | } |
305 | EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); | 305 | EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); |
306 | 306 | ||
307 | static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | ||
308 | void *ctx, | ||
309 | struct blkcipher_desc *desc, | ||
310 | struct blkcipher_walk *walk) | ||
311 | { | ||
312 | const unsigned int bsize = 128 / 8; | ||
313 | unsigned int nbytes = walk->nbytes; | ||
314 | u128 *src = (u128 *)walk->src.virt.addr; | ||
315 | u128 *dst = (u128 *)walk->dst.virt.addr; | ||
316 | unsigned int num_blocks, func_bytes; | ||
317 | unsigned int i; | ||
318 | |||
319 | /* Process multi-block batch */ | ||
320 | for (i = 0; i < gctx->num_funcs; i++) { | ||
321 | num_blocks = gctx->funcs[i].num_blocks; | ||
322 | func_bytes = bsize * num_blocks; | ||
323 | |||
324 | if (nbytes >= func_bytes) { | ||
325 | do { | ||
326 | gctx->funcs[i].fn_u.xts(ctx, dst, src, | ||
327 | (le128 *)walk->iv); | ||
328 | |||
329 | src += num_blocks; | ||
330 | dst += num_blocks; | ||
331 | nbytes -= func_bytes; | ||
332 | } while (nbytes >= func_bytes); | ||
333 | |||
334 | if (nbytes < bsize) | ||
335 | goto done; | ||
336 | } | ||
337 | } | ||
338 | |||
339 | done: | ||
340 | return nbytes; | ||
341 | } | ||
342 | |||
343 | /* for implementations providing a faster XTS IV generator */ | ||
344 | int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | ||
345 | struct blkcipher_desc *desc, struct scatterlist *dst, | ||
346 | struct scatterlist *src, unsigned int nbytes, | ||
347 | void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src), | ||
348 | void *tweak_ctx, void *crypt_ctx) | ||
349 | { | ||
350 | const unsigned int bsize = 128 / 8; | ||
351 | bool fpu_enabled = false; | ||
352 | struct blkcipher_walk walk; | ||
353 | int err; | ||
354 | |||
355 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
356 | |||
357 | err = blkcipher_walk_virt(desc, &walk); | ||
358 | nbytes = walk.nbytes; | ||
359 | if (!nbytes) | ||
360 | return err; | ||
361 | |||
362 | /* set minimum length to bsize, for tweak_fn */ | ||
363 | fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, | ||
364 | desc, fpu_enabled, | ||
365 | nbytes < bsize ? bsize : nbytes); | ||
366 | |||
367 | /* calculate first value of T */ | ||
368 | tweak_fn(tweak_ctx, walk.iv, walk.iv); | ||
369 | |||
370 | while (nbytes) { | ||
371 | nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); | ||
372 | |||
373 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
374 | nbytes = walk.nbytes; | ||
375 | } | ||
376 | |||
377 | glue_fpu_end(fpu_enabled); | ||
378 | |||
379 | return err; | ||
380 | } | ||
381 | EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); | ||
382 | |||
383 | void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, | ||
384 | common_glue_func_t fn) | ||
385 | { | ||
386 | le128 ivblk = *iv; | ||
387 | |||
388 | /* generate next IV */ | ||
389 | le128_gf128mul_x_ble(iv, &ivblk); | ||
390 | |||
391 | /* CC <- T xor C */ | ||
392 | u128_xor(dst, src, (u128 *)&ivblk); | ||
393 | |||
394 | /* PP <- D(Key2,CC) */ | ||
395 | fn(ctx, (u8 *)dst, (u8 *)dst); | ||
396 | |||
397 | /* P <- T xor PP */ | ||
398 | u128_xor(dst, dst, (u128 *)&ivblk); | ||
399 | } | ||
400 | EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); | ||
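Note on glue_xts_crypt_128bit_one: this is the single-block fallback used once fewer blocks remain than the smallest assembler batch. It performs the textbook XTS step with the tweak left behind by the batched routines and then advances the tweak. A scalar C sketch of the same step (illustrative only; "cipher" stands for the per-block encrypt or decrypt routine):

    /* One XTS block: CC = C xor T; PP = cipher(CC); P = PP xor T; T = T*alpha. */
    static void xts_one_block_sketch(void *ctx, unsigned long long dst[2],
                                     const unsigned long long src[2],
                                     unsigned long long t[2],
                                     void (*cipher)(void *ctx, unsigned char *blk))
    {
            unsigned long long carry = t[1] >> 63;

            dst[0] = src[0] ^ t[0];                 /* CC <- T xor C  */
            dst[1] = src[1] ^ t[1];
            cipher(ctx, (unsigned char *)dst);      /* PP <- cipher(CC) */
            dst[0] ^= t[0];                         /* P  <- T xor PP */
            dst[1] ^= t[1];

            /* advance the tweak for the next block: T <- T * alpha */
            t[1] = (t[1] << 1) | (t[0] >> 63);
            t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
    }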
401 | |||
307 | MODULE_LICENSE("GPL"); | 402 | MODULE_LICENSE("GPL"); |
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 43c938612b74..2f202f49872b 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S | |||
@@ -4,8 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by | 7 | * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
9 | * | 8 | * |
10 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -34,6 +33,8 @@ | |||
34 | 33 | ||
35 | .Lbswap128_mask: | 34 | .Lbswap128_mask: |
36 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
36 | .Lxts_gf128mul_and_shl1_mask: | ||
37 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
37 | 38 | ||
38 | .text | 39 | .text |
39 | 40 | ||
@@ -739,3 +740,43 @@ ENTRY(serpent_ctr_8way_avx) | |||
739 | 740 | ||
740 | ret; | 741 | ret; |
741 | ENDPROC(serpent_ctr_8way_avx) | 742 | ENDPROC(serpent_ctr_8way_avx) |
743 | |||
744 | ENTRY(serpent_xts_enc_8way_avx) | ||
745 | /* input: | ||
746 | * %rdi: ctx, CTX | ||
747 | * %rsi: dst | ||
748 | * %rdx: src | ||
749 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
750 | */ | ||
751 | |||
752 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
753 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
754 | RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); | ||
755 | |||
756 | call __serpent_enc_blk8_avx; | ||
757 | |||
758 | /* dst <= regs xor IVs(in dst) */ | ||
759 | store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
760 | |||
761 | ret; | ||
762 | ENDPROC(serpent_xts_enc_8way_avx) | ||
763 | |||
764 | ENTRY(serpent_xts_dec_8way_avx) | ||
765 | /* input: | ||
766 | * %rdi: ctx, CTX | ||
767 | * %rsi: dst | ||
768 | * %rdx: src | ||
769 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
770 | */ | ||
771 | |||
772 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
773 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
774 | RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); | ||
775 | |||
776 | call __serpent_dec_blk8_avx; | ||
777 | |||
778 | /* dst <= regs xor IVs(in dst) */ | ||
779 | store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
780 | |||
781 | ret; | ||
782 | ENDPROC(serpent_xts_dec_8way_avx) | ||
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S new file mode 100644 index 000000000000..b222085cccac --- /dev/null +++ b/arch/x86/crypto/serpent-avx2-asm_64.S | |||
@@ -0,0 +1,800 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Serpent | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * Based on AVX assembler implementation of Serpent by: | ||
7 | * Copyright © 2012 Johannes Goetzfried | ||
8 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or | ||
13 | * (at your option) any later version. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/linkage.h> | ||
18 | #include "glue_helper-asm-avx2.S" | ||
19 | |||
20 | .file "serpent-avx2-asm_64.S" | ||
21 | |||
22 | .data | ||
23 | .align 16 | ||
24 | |||
25 | .Lbswap128_mask: | ||
26 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
27 | .Lxts_gf128mul_and_shl1_mask_0: | ||
28 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
29 | .Lxts_gf128mul_and_shl1_mask_1: | ||
30 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
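Note on the two masks: .Lxts_gf128mul_and_shl1_mask_0 is the usual single-doubling mask (0x87 folded into the low qword, carry 1 into the high qword), used by gf128mul_x2_ble for the carries produced at the second of its two doublings (bits 126 and 62 of the original tweak). .Lxts_gf128mul_and_shl1_mask_1 handles the carries from the first doubling (bits 127 and 63), which get shifted once more by the second doubling, hence its low qword is 0x10e = 0x87 << 1 and its high-qword carry is 2.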
31 | |||
32 | .text | ||
33 | |||
34 | #define CTX %rdi | ||
35 | |||
36 | #define RNOT %ymm0 | ||
37 | #define tp %ymm1 | ||
38 | |||
39 | #define RA1 %ymm2 | ||
40 | #define RA2 %ymm3 | ||
41 | #define RB1 %ymm4 | ||
42 | #define RB2 %ymm5 | ||
43 | #define RC1 %ymm6 | ||
44 | #define RC2 %ymm7 | ||
45 | #define RD1 %ymm8 | ||
46 | #define RD2 %ymm9 | ||
47 | #define RE1 %ymm10 | ||
48 | #define RE2 %ymm11 | ||
49 | |||
50 | #define RK0 %ymm12 | ||
51 | #define RK1 %ymm13 | ||
52 | #define RK2 %ymm14 | ||
53 | #define RK3 %ymm15 | ||
54 | |||
55 | #define RK0x %xmm12 | ||
56 | #define RK1x %xmm13 | ||
57 | #define RK2x %xmm14 | ||
58 | #define RK3x %xmm15 | ||
59 | |||
60 | #define S0_1(x0, x1, x2, x3, x4) \ | ||
61 | vpor x0, x3, tp; \ | ||
62 | vpxor x3, x0, x0; \ | ||
63 | vpxor x2, x3, x4; \ | ||
64 | vpxor RNOT, x4, x4; \ | ||
65 | vpxor x1, tp, x3; \ | ||
66 | vpand x0, x1, x1; \ | ||
67 | vpxor x4, x1, x1; \ | ||
68 | vpxor x0, x2, x2; | ||
69 | #define S0_2(x0, x1, x2, x3, x4) \ | ||
70 | vpxor x3, x0, x0; \ | ||
71 | vpor x0, x4, x4; \ | ||
72 | vpxor x2, x0, x0; \ | ||
73 | vpand x1, x2, x2; \ | ||
74 | vpxor x2, x3, x3; \ | ||
75 | vpxor RNOT, x1, x1; \ | ||
76 | vpxor x4, x2, x2; \ | ||
77 | vpxor x2, x1, x1; | ||
78 | |||
79 | #define S1_1(x0, x1, x2, x3, x4) \ | ||
80 | vpxor x0, x1, tp; \ | ||
81 | vpxor x3, x0, x0; \ | ||
82 | vpxor RNOT, x3, x3; \ | ||
83 | vpand tp, x1, x4; \ | ||
84 | vpor tp, x0, x0; \ | ||
85 | vpxor x2, x3, x3; \ | ||
86 | vpxor x3, x0, x0; \ | ||
87 | vpxor x3, tp, x1; | ||
88 | #define S1_2(x0, x1, x2, x3, x4) \ | ||
89 | vpxor x4, x3, x3; \ | ||
90 | vpor x4, x1, x1; \ | ||
91 | vpxor x2, x4, x4; \ | ||
92 | vpand x0, x2, x2; \ | ||
93 | vpxor x1, x2, x2; \ | ||
94 | vpor x0, x1, x1; \ | ||
95 | vpxor RNOT, x0, x0; \ | ||
96 | vpxor x2, x0, x0; \ | ||
97 | vpxor x1, x4, x4; | ||
98 | |||
99 | #define S2_1(x0, x1, x2, x3, x4) \ | ||
100 | vpxor RNOT, x3, x3; \ | ||
101 | vpxor x0, x1, x1; \ | ||
102 | vpand x2, x0, tp; \ | ||
103 | vpxor x3, tp, tp; \ | ||
104 | vpor x0, x3, x3; \ | ||
105 | vpxor x1, x2, x2; \ | ||
106 | vpxor x1, x3, x3; \ | ||
107 | vpand tp, x1, x1; | ||
108 | #define S2_2(x0, x1, x2, x3, x4) \ | ||
109 | vpxor x2, tp, tp; \ | ||
110 | vpand x3, x2, x2; \ | ||
111 | vpor x1, x3, x3; \ | ||
112 | vpxor RNOT, tp, tp; \ | ||
113 | vpxor tp, x3, x3; \ | ||
114 | vpxor tp, x0, x4; \ | ||
115 | vpxor x2, tp, x0; \ | ||
116 | vpor x2, x1, x1; | ||
117 | |||
118 | #define S3_1(x0, x1, x2, x3, x4) \ | ||
119 | vpxor x3, x1, tp; \ | ||
120 | vpor x0, x3, x3; \ | ||
121 | vpand x0, x1, x4; \ | ||
122 | vpxor x2, x0, x0; \ | ||
123 | vpxor tp, x2, x2; \ | ||
124 | vpand x3, tp, x1; \ | ||
125 | vpxor x3, x2, x2; \ | ||
126 | vpor x4, x0, x0; \ | ||
127 | vpxor x3, x4, x4; | ||
128 | #define S3_2(x0, x1, x2, x3, x4) \ | ||
129 | vpxor x0, x1, x1; \ | ||
130 | vpand x3, x0, x0; \ | ||
131 | vpand x4, x3, x3; \ | ||
132 | vpxor x2, x3, x3; \ | ||
133 | vpor x1, x4, x4; \ | ||
134 | vpand x1, x2, x2; \ | ||
135 | vpxor x3, x4, x4; \ | ||
136 | vpxor x3, x0, x0; \ | ||
137 | vpxor x2, x3, x3; | ||
138 | |||
139 | #define S4_1(x0, x1, x2, x3, x4) \ | ||
140 | vpand x0, x3, tp; \ | ||
141 | vpxor x3, x0, x0; \ | ||
142 | vpxor x2, tp, tp; \ | ||
143 | vpor x3, x2, x2; \ | ||
144 | vpxor x1, x0, x0; \ | ||
145 | vpxor tp, x3, x4; \ | ||
146 | vpor x0, x2, x2; \ | ||
147 | vpxor x1, x2, x2; | ||
148 | #define S4_2(x0, x1, x2, x3, x4) \ | ||
149 | vpand x0, x1, x1; \ | ||
150 | vpxor x4, x1, x1; \ | ||
151 | vpand x2, x4, x4; \ | ||
152 | vpxor tp, x2, x2; \ | ||
153 | vpxor x0, x4, x4; \ | ||
154 | vpor x1, tp, x3; \ | ||
155 | vpxor RNOT, x1, x1; \ | ||
156 | vpxor x0, x3, x3; | ||
157 | |||
158 | #define S5_1(x0, x1, x2, x3, x4) \ | ||
159 | vpor x0, x1, tp; \ | ||
160 | vpxor tp, x2, x2; \ | ||
161 | vpxor RNOT, x3, x3; \ | ||
162 | vpxor x0, x1, x4; \ | ||
163 | vpxor x2, x0, x0; \ | ||
164 | vpand x4, tp, x1; \ | ||
165 | vpor x3, x4, x4; \ | ||
166 | vpxor x0, x4, x4; | ||
167 | #define S5_2(x0, x1, x2, x3, x4) \ | ||
168 | vpand x3, x0, x0; \ | ||
169 | vpxor x3, x1, x1; \ | ||
170 | vpxor x2, x3, x3; \ | ||
171 | vpxor x1, x0, x0; \ | ||
172 | vpand x4, x2, x2; \ | ||
173 | vpxor x2, x1, x1; \ | ||
174 | vpand x0, x2, x2; \ | ||
175 | vpxor x2, x3, x3; | ||
176 | |||
177 | #define S6_1(x0, x1, x2, x3, x4) \ | ||
178 | vpxor x0, x3, x3; \ | ||
179 | vpxor x2, x1, tp; \ | ||
180 | vpxor x0, x2, x2; \ | ||
181 | vpand x3, x0, x0; \ | ||
182 | vpor x3, tp, tp; \ | ||
183 | vpxor RNOT, x1, x4; \ | ||
184 | vpxor tp, x0, x0; \ | ||
185 | vpxor x2, tp, x1; | ||
186 | #define S6_2(x0, x1, x2, x3, x4) \ | ||
187 | vpxor x4, x3, x3; \ | ||
188 | vpxor x0, x4, x4; \ | ||
189 | vpand x0, x2, x2; \ | ||
190 | vpxor x1, x4, x4; \ | ||
191 | vpxor x3, x2, x2; \ | ||
192 | vpand x1, x3, x3; \ | ||
193 | vpxor x0, x3, x3; \ | ||
194 | vpxor x2, x1, x1; | ||
195 | |||
196 | #define S7_1(x0, x1, x2, x3, x4) \ | ||
197 | vpxor RNOT, x1, tp; \ | ||
198 | vpxor RNOT, x0, x0; \ | ||
199 | vpand x2, tp, x1; \ | ||
200 | vpxor x3, x1, x1; \ | ||
201 | vpor tp, x3, x3; \ | ||
202 | vpxor x2, tp, x4; \ | ||
203 | vpxor x3, x2, x2; \ | ||
204 | vpxor x0, x3, x3; \ | ||
205 | vpor x1, x0, x0; | ||
206 | #define S7_2(x0, x1, x2, x3, x4) \ | ||
207 | vpand x0, x2, x2; \ | ||
208 | vpxor x4, x0, x0; \ | ||
209 | vpxor x3, x4, x4; \ | ||
210 | vpand x0, x3, x3; \ | ||
211 | vpxor x1, x4, x4; \ | ||
212 | vpxor x4, x2, x2; \ | ||
213 | vpxor x1, x3, x3; \ | ||
214 | vpor x0, x4, x4; \ | ||
215 | vpxor x1, x4, x4; | ||
216 | |||
217 | #define SI0_1(x0, x1, x2, x3, x4) \ | ||
218 | vpxor x0, x1, x1; \ | ||
219 | vpor x1, x3, tp; \ | ||
220 | vpxor x1, x3, x4; \ | ||
221 | vpxor RNOT, x0, x0; \ | ||
222 | vpxor tp, x2, x2; \ | ||
223 | vpxor x0, tp, x3; \ | ||
224 | vpand x1, x0, x0; \ | ||
225 | vpxor x2, x0, x0; | ||
226 | #define SI0_2(x0, x1, x2, x3, x4) \ | ||
227 | vpand x3, x2, x2; \ | ||
228 | vpxor x4, x3, x3; \ | ||
229 | vpxor x3, x2, x2; \ | ||
230 | vpxor x3, x1, x1; \ | ||
231 | vpand x0, x3, x3; \ | ||
232 | vpxor x0, x1, x1; \ | ||
233 | vpxor x2, x0, x0; \ | ||
234 | vpxor x3, x4, x4; | ||
235 | |||
236 | #define SI1_1(x0, x1, x2, x3, x4) \ | ||
237 | vpxor x3, x1, x1; \ | ||
238 | vpxor x2, x0, tp; \ | ||
239 | vpxor RNOT, x2, x2; \ | ||
240 | vpor x1, x0, x4; \ | ||
241 | vpxor x3, x4, x4; \ | ||
242 | vpand x1, x3, x3; \ | ||
243 | vpxor x2, x1, x1; \ | ||
244 | vpand x4, x2, x2; | ||
245 | #define SI1_2(x0, x1, x2, x3, x4) \ | ||
246 | vpxor x1, x4, x4; \ | ||
247 | vpor x3, x1, x1; \ | ||
248 | vpxor tp, x3, x3; \ | ||
249 | vpxor tp, x2, x2; \ | ||
250 | vpor x4, tp, x0; \ | ||
251 | vpxor x4, x2, x2; \ | ||
252 | vpxor x0, x1, x1; \ | ||
253 | vpxor x1, x4, x4; | ||
254 | |||
255 | #define SI2_1(x0, x1, x2, x3, x4) \ | ||
256 | vpxor x1, x2, x2; \ | ||
257 | vpxor RNOT, x3, tp; \ | ||
258 | vpor x2, tp, tp; \ | ||
259 | vpxor x3, x2, x2; \ | ||
260 | vpxor x0, x3, x4; \ | ||
261 | vpxor x1, tp, x3; \ | ||
262 | vpor x2, x1, x1; \ | ||
263 | vpxor x0, x2, x2; | ||
264 | #define SI2_2(x0, x1, x2, x3, x4) \ | ||
265 | vpxor x4, x1, x1; \ | ||
266 | vpor x3, x4, x4; \ | ||
267 | vpxor x3, x2, x2; \ | ||
268 | vpxor x2, x4, x4; \ | ||
269 | vpand x1, x2, x2; \ | ||
270 | vpxor x3, x2, x2; \ | ||
271 | vpxor x4, x3, x3; \ | ||
272 | vpxor x0, x4, x4; | ||
273 | |||
274 | #define SI3_1(x0, x1, x2, x3, x4) \ | ||
275 | vpxor x1, x2, x2; \ | ||
276 | vpand x2, x1, tp; \ | ||
277 | vpxor x0, tp, tp; \ | ||
278 | vpor x1, x0, x0; \ | ||
279 | vpxor x3, x1, x4; \ | ||
280 | vpxor x3, x0, x0; \ | ||
281 | vpor tp, x3, x3; \ | ||
282 | vpxor x2, tp, x1; | ||
283 | #define SI3_2(x0, x1, x2, x3, x4) \ | ||
284 | vpxor x3, x1, x1; \ | ||
285 | vpxor x2, x0, x0; \ | ||
286 | vpxor x3, x2, x2; \ | ||
287 | vpand x1, x3, x3; \ | ||
288 | vpxor x0, x1, x1; \ | ||
289 | vpand x2, x0, x0; \ | ||
290 | vpxor x3, x4, x4; \ | ||
291 | vpxor x0, x3, x3; \ | ||
292 | vpxor x1, x0, x0; | ||
293 | |||
294 | #define SI4_1(x0, x1, x2, x3, x4) \ | ||
295 | vpxor x3, x2, x2; \ | ||
296 | vpand x1, x0, tp; \ | ||
297 | vpxor x2, tp, tp; \ | ||
298 | vpor x3, x2, x2; \ | ||
299 | vpxor RNOT, x0, x4; \ | ||
300 | vpxor tp, x1, x1; \ | ||
301 | vpxor x2, tp, x0; \ | ||
302 | vpand x4, x2, x2; | ||
303 | #define SI4_2(x0, x1, x2, x3, x4) \ | ||
304 | vpxor x0, x2, x2; \ | ||
305 | vpor x4, x0, x0; \ | ||
306 | vpxor x3, x0, x0; \ | ||
307 | vpand x2, x3, x3; \ | ||
308 | vpxor x3, x4, x4; \ | ||
309 | vpxor x1, x3, x3; \ | ||
310 | vpand x0, x1, x1; \ | ||
311 | vpxor x1, x4, x4; \ | ||
312 | vpxor x3, x0, x0; | ||
313 | |||
314 | #define SI5_1(x0, x1, x2, x3, x4) \ | ||
315 | vpor x2, x1, tp; \ | ||
316 | vpxor x1, x2, x2; \ | ||
317 | vpxor x3, tp, tp; \ | ||
318 | vpand x1, x3, x3; \ | ||
319 | vpxor x3, x2, x2; \ | ||
320 | vpor x0, x3, x3; \ | ||
321 | vpxor RNOT, x0, x0; \ | ||
322 | vpxor x2, x3, x3; \ | ||
323 | vpor x0, x2, x2; | ||
324 | #define SI5_2(x0, x1, x2, x3, x4) \ | ||
325 | vpxor tp, x1, x4; \ | ||
326 | vpxor x4, x2, x2; \ | ||
327 | vpand x0, x4, x4; \ | ||
328 | vpxor tp, x0, x0; \ | ||
329 | vpxor x3, tp, x1; \ | ||
330 | vpand x2, x0, x0; \ | ||
331 | vpxor x3, x2, x2; \ | ||
332 | vpxor x2, x0, x0; \ | ||
333 | vpxor x4, x2, x2; \ | ||
334 | vpxor x3, x4, x4; | ||
335 | |||
336 | #define SI6_1(x0, x1, x2, x3, x4) \ | ||
337 | vpxor x2, x0, x0; \ | ||
338 | vpand x3, x0, tp; \ | ||
339 | vpxor x3, x2, x2; \ | ||
340 | vpxor x2, tp, tp; \ | ||
341 | vpxor x1, x3, x3; \ | ||
342 | vpor x0, x2, x2; \ | ||
343 | vpxor x3, x2, x2; \ | ||
344 | vpand tp, x3, x3; | ||
345 | #define SI6_2(x0, x1, x2, x3, x4) \ | ||
346 | vpxor RNOT, tp, tp; \ | ||
347 | vpxor x1, x3, x3; \ | ||
348 | vpand x2, x1, x1; \ | ||
349 | vpxor tp, x0, x4; \ | ||
350 | vpxor x4, x3, x3; \ | ||
351 | vpxor x2, x4, x4; \ | ||
352 | vpxor x1, tp, x0; \ | ||
353 | vpxor x0, x2, x2; | ||
354 | |||
355 | #define SI7_1(x0, x1, x2, x3, x4) \ | ||
356 | vpand x0, x3, tp; \ | ||
357 | vpxor x2, x0, x0; \ | ||
358 | vpor x3, x2, x2; \ | ||
359 | vpxor x1, x3, x4; \ | ||
360 | vpxor RNOT, x0, x0; \ | ||
361 | vpor tp, x1, x1; \ | ||
362 | vpxor x0, x4, x4; \ | ||
363 | vpand x2, x0, x0; \ | ||
364 | vpxor x1, x0, x0; | ||
365 | #define SI7_2(x0, x1, x2, x3, x4) \ | ||
366 | vpand x2, x1, x1; \ | ||
367 | vpxor x2, tp, x3; \ | ||
368 | vpxor x3, x4, x4; \ | ||
369 | vpand x3, x2, x2; \ | ||
370 | vpor x0, x3, x3; \ | ||
371 | vpxor x4, x1, x1; \ | ||
372 | vpxor x4, x3, x3; \ | ||
373 | vpand x0, x4, x4; \ | ||
374 | vpxor x2, x4, x4; | ||
375 | |||
376 | #define get_key(i,j,t) \ | ||
377 | vpbroadcastd (4*(i)+(j))*4(CTX), t; | ||
378 | |||
379 | #define K2(x0, x1, x2, x3, x4, i) \ | ||
380 | get_key(i, 0, RK0); \ | ||
381 | get_key(i, 1, RK1); \ | ||
382 | get_key(i, 2, RK2); \ | ||
383 | get_key(i, 3, RK3); \ | ||
384 | vpxor RK0, x0 ## 1, x0 ## 1; \ | ||
385 | vpxor RK1, x1 ## 1, x1 ## 1; \ | ||
386 | vpxor RK2, x2 ## 1, x2 ## 1; \ | ||
387 | vpxor RK3, x3 ## 1, x3 ## 1; \ | ||
388 | vpxor RK0, x0 ## 2, x0 ## 2; \ | ||
389 | vpxor RK1, x1 ## 2, x1 ## 2; \ | ||
390 | vpxor RK2, x2 ## 2, x2 ## 2; \ | ||
391 | vpxor RK3, x3 ## 2, x3 ## 2; | ||
392 | |||
393 | #define LK2(x0, x1, x2, x3, x4, i) \ | ||
394 | vpslld $13, x0 ## 1, x4 ## 1; \ | ||
395 | vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \ | ||
396 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
397 | vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ | ||
398 | vpslld $3, x2 ## 1, x4 ## 1; \ | ||
399 | vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \ | ||
400 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
401 | vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ | ||
402 | vpslld $13, x0 ## 2, x4 ## 2; \ | ||
403 | vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \ | ||
404 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
405 | vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ | ||
406 | vpslld $3, x2 ## 2, x4 ## 2; \ | ||
407 | vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \ | ||
408 | vpor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
409 | vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ | ||
410 | vpslld $1, x1 ## 1, x4 ## 1; \ | ||
411 | vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \ | ||
412 | vpor x4 ## 1, x1 ## 1, x1 ## 1; \ | ||
413 | vpslld $3, x0 ## 1, x4 ## 1; \ | ||
414 | vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ | ||
415 | vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
416 | get_key(i, 1, RK1); \ | ||
417 | vpslld $1, x1 ## 2, x4 ## 2; \ | ||
418 | vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \ | ||
419 | vpor x4 ## 2, x1 ## 2, x1 ## 2; \ | ||
420 | vpslld $3, x0 ## 2, x4 ## 2; \ | ||
421 | vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ | ||
422 | vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
423 | get_key(i, 3, RK3); \ | ||
424 | vpslld $7, x3 ## 1, x4 ## 1; \ | ||
425 | vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \ | ||
426 | vpor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
427 | vpslld $7, x1 ## 1, x4 ## 1; \ | ||
428 | vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ | ||
429 | vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ | ||
430 | vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ | ||
431 | vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
432 | get_key(i, 0, RK0); \ | ||
433 | vpslld $7, x3 ## 2, x4 ## 2; \ | ||
434 | vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \ | ||
435 | vpor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
436 | vpslld $7, x1 ## 2, x4 ## 2; \ | ||
437 | vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ | ||
438 | vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ | ||
439 | vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ | ||
440 | vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
441 | get_key(i, 2, RK2); \ | ||
442 | vpxor RK1, x1 ## 1, x1 ## 1; \ | ||
443 | vpxor RK3, x3 ## 1, x3 ## 1; \ | ||
444 | vpslld $5, x0 ## 1, x4 ## 1; \ | ||
445 | vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \ | ||
446 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
447 | vpslld $22, x2 ## 1, x4 ## 1; \ | ||
448 | vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \ | ||
449 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
450 | vpxor RK0, x0 ## 1, x0 ## 1; \ | ||
451 | vpxor RK2, x2 ## 1, x2 ## 1; \ | ||
452 | vpxor RK1, x1 ## 2, x1 ## 2; \ | ||
453 | vpxor RK3, x3 ## 2, x3 ## 2; \ | ||
454 | vpslld $5, x0 ## 2, x4 ## 2; \ | ||
455 | vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \ | ||
456 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
457 | vpslld $22, x2 ## 2, x4 ## 2; \ | ||
458 | vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \ | ||
459 | vpor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
460 | vpxor RK0, x0 ## 2, x0 ## 2; \ | ||
461 | vpxor RK2, x2 ## 2, x2 ## 2; | ||
462 | |||
463 | #define KL2(x0, x1, x2, x3, x4, i) \ | ||
464 | vpxor RK0, x0 ## 1, x0 ## 1; \ | ||
465 | vpxor RK2, x2 ## 1, x2 ## 1; \ | ||
466 | vpsrld $5, x0 ## 1, x4 ## 1; \ | ||
467 | vpslld $(32 - 5), x0 ## 1, x0 ## 1; \ | ||
468 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
469 | vpxor RK3, x3 ## 1, x3 ## 1; \ | ||
470 | vpxor RK1, x1 ## 1, x1 ## 1; \ | ||
471 | vpsrld $22, x2 ## 1, x4 ## 1; \ | ||
472 | vpslld $(32 - 22), x2 ## 1, x2 ## 1; \ | ||
473 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
474 | vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ | ||
475 | vpxor RK0, x0 ## 2, x0 ## 2; \ | ||
476 | vpxor RK2, x2 ## 2, x2 ## 2; \ | ||
477 | vpsrld $5, x0 ## 2, x4 ## 2; \ | ||
478 | vpslld $(32 - 5), x0 ## 2, x0 ## 2; \ | ||
479 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
480 | vpxor RK3, x3 ## 2, x3 ## 2; \ | ||
481 | vpxor RK1, x1 ## 2, x1 ## 2; \ | ||
482 | vpsrld $22, x2 ## 2, x4 ## 2; \ | ||
483 | vpslld $(32 - 22), x2 ## 2, x2 ## 2; \ | ||
484 | vpor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
485 | vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ | ||
486 | vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ | ||
487 | vpslld $7, x1 ## 1, x4 ## 1; \ | ||
488 | vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ | ||
489 | vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
490 | vpsrld $1, x1 ## 1, x4 ## 1; \ | ||
491 | vpslld $(32 - 1), x1 ## 1, x1 ## 1; \ | ||
492 | vpor x4 ## 1, x1 ## 1, x1 ## 1; \ | ||
493 | vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ | ||
494 | vpslld $7, x1 ## 2, x4 ## 2; \ | ||
495 | vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ | ||
496 | vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ | ||
497 | vpsrld $1, x1 ## 2, x4 ## 2; \ | ||
498 | vpslld $(32 - 1), x1 ## 2, x1 ## 2; \ | ||
499 | vpor x4 ## 2, x1 ## 2, x1 ## 2; \ | ||
500 | vpsrld $7, x3 ## 1, x4 ## 1; \ | ||
501 | vpslld $(32 - 7), x3 ## 1, x3 ## 1; \ | ||
502 | vpor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
503 | vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ | ||
504 | vpslld $3, x0 ## 1, x4 ## 1; \ | ||
505 | vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ | ||
506 | vpsrld $7, x3 ## 2, x4 ## 2; \ | ||
507 | vpslld $(32 - 7), x3 ## 2, x3 ## 2; \ | ||
508 | vpor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
509 | vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ | ||
510 | vpslld $3, x0 ## 2, x4 ## 2; \ | ||
511 | vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ | ||
512 | vpsrld $13, x0 ## 1, x4 ## 1; \ | ||
513 | vpslld $(32 - 13), x0 ## 1, x0 ## 1; \ | ||
514 | vpor x4 ## 1, x0 ## 1, x0 ## 1; \ | ||
515 | vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ | ||
516 | vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ | ||
517 | vpsrld $3, x2 ## 1, x4 ## 1; \ | ||
518 | vpslld $(32 - 3), x2 ## 1, x2 ## 1; \ | ||
519 | vpor x4 ## 1, x2 ## 1, x2 ## 1; \ | ||
520 | vpsrld $13, x0 ## 2, x4 ## 2; \ | ||
521 | vpslld $(32 - 13), x0 ## 2, x0 ## 2; \ | ||
522 | vpor x4 ## 2, x0 ## 2, x0 ## 2; \ | ||
523 | vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ | ||
524 | vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ | ||
525 | vpsrld $3, x2 ## 2, x4 ## 2; \ | ||
526 | vpslld $(32 - 3), x2 ## 2, x2 ## 2; \ | ||
527 | vpor x4 ## 2, x2 ## 2, x2 ## 2; | ||
528 | |||
529 | #define S(SBOX, x0, x1, x2, x3, x4) \ | ||
530 | SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
531 | SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
532 | SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ | ||
533 | SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); | ||
534 | |||
535 | #define SP(SBOX, x0, x1, x2, x3, x4, i) \ | ||
536 | get_key(i, 0, RK0); \ | ||
537 | SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
538 | get_key(i, 2, RK2); \ | ||
539 | SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ | ||
540 | get_key(i, 3, RK3); \ | ||
541 | SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ | ||
542 | get_key(i, 1, RK1); \ | ||
543 | SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ | ||
544 | |||
545 | #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
546 | vpunpckldq x1, x0, t0; \ | ||
547 | vpunpckhdq x1, x0, t2; \ | ||
548 | vpunpckldq x3, x2, t1; \ | ||
549 | vpunpckhdq x3, x2, x3; \ | ||
550 | \ | ||
551 | vpunpcklqdq t1, t0, x0; \ | ||
552 | vpunpckhqdq t1, t0, x1; \ | ||
553 | vpunpcklqdq x3, t2, x2; \ | ||
554 | vpunpckhqdq x3, t2, x3; | ||
555 | |||
556 | #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ | ||
557 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | ||
558 | |||
559 | #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ | ||
560 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | ||
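Note on read_blocks/write_blocks: the transpose converts between the memory layout, where each 128-bit block is four consecutive 32-bit words, and the computation layout, where every ymm register holds the same word index from eight different blocks (one per 32-bit SIMD lane), so the round macros can process the whole batch with ordinary word-wide operations. A scalar C sketch of the 4x4 word transpose done per 128-bit lane (illustrative only):

    /* in[b][w] = word w of block b; out[w][b] gathers word w of all blocks. */
    static void transpose_4x4_sketch(unsigned int out[4][4],
                                     const unsigned int in[4][4])
    {
            int b, w;

            for (b = 0; b < 4; b++)
                    for (w = 0; w < 4; w++)
                            out[w][b] = in[b][w];
    }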
561 | |||
562 | .align 8 | ||
563 | __serpent_enc_blk16: | ||
564 | /* input: | ||
565 | * %rdi: ctx, CTX | ||
566 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext | ||
567 | * output: | ||
568 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext | ||
569 | */ | ||
570 | |||
571 | vpcmpeqd RNOT, RNOT, RNOT; | ||
572 | |||
573 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
574 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
575 | |||
576 | K2(RA, RB, RC, RD, RE, 0); | ||
577 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); | ||
578 | S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); | ||
579 | S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); | ||
580 | S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); | ||
581 | S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); | ||
582 | S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6); | ||
583 | S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); | ||
584 | S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); | ||
585 | S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); | ||
586 | S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); | ||
587 | S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); | ||
588 | S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); | ||
589 | S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); | ||
590 | S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); | ||
591 | S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); | ||
592 | S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); | ||
593 | S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); | ||
594 | S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); | ||
595 | S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); | ||
596 | S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); | ||
597 | S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); | ||
598 | S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); | ||
599 | S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); | ||
600 | S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); | ||
601 | S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); | ||
602 | S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); | ||
603 | S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); | ||
604 | S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); | ||
605 | S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); | ||
606 | S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); | ||
607 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); | ||
608 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); | ||
609 | |||
610 | write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
611 | write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
612 | |||
613 | ret; | ||
614 | ENDPROC(__serpent_enc_blk16) | ||
615 | |||
616 | .align 8 | ||
617 | __serpent_dec_blk16: | ||
618 | /* input: | ||
619 | * %rdi: ctx, CTX | ||
620 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext | ||
621 | * output: | ||
622 | * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext | ||
623 | */ | ||
624 | |||
625 | vpcmpeqd RNOT, RNOT, RNOT; | ||
626 | |||
627 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
628 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
629 | |||
630 | K2(RA, RB, RC, RD, RE, 32); | ||
631 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); | ||
632 | SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); | ||
633 | SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); | ||
634 | SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); | ||
635 | SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); | ||
636 | SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); | ||
637 | SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); | ||
638 | SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); | ||
639 | SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); | ||
640 | SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); | ||
641 | SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); | ||
642 | SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); | ||
643 | SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); | ||
644 | SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); | ||
645 | SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); | ||
646 | SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); | ||
647 | SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); | ||
648 | SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); | ||
649 | SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); | ||
650 | SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12); | ||
651 | SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); | ||
652 | SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); | ||
653 | SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); | ||
654 | SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); | ||
655 | SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); | ||
656 | SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); | ||
657 | SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); | ||
658 | SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); | ||
659 | SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); | ||
660 | SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); | ||
661 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); | ||
662 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); | ||
663 | |||
664 | write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); | ||
665 | write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); | ||
666 | |||
667 | ret; | ||
668 | ENDPROC(__serpent_dec_blk16) | ||
669 | |||
670 | ENTRY(serpent_ecb_enc_16way) | ||
671 | /* input: | ||
672 | * %rdi: ctx, CTX | ||
673 | * %rsi: dst | ||
674 | * %rdx: src | ||
675 | */ | ||
676 | |||
677 | vzeroupper; | ||
678 | |||
679 | load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
680 | |||
681 | call __serpent_enc_blk16; | ||
682 | |||
683 | store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
684 | |||
685 | vzeroupper; | ||
686 | |||
687 | ret; | ||
688 | ENDPROC(serpent_ecb_enc_16way) | ||
689 | |||
690 | ENTRY(serpent_ecb_dec_16way) | ||
691 | /* input: | ||
692 | * %rdi: ctx, CTX | ||
693 | * %rsi: dst | ||
694 | * %rdx: src | ||
695 | */ | ||
696 | |||
697 | vzeroupper; | ||
698 | |||
699 | load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
700 | |||
701 | call __serpent_dec_blk16; | ||
702 | |||
703 | store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
704 | |||
705 | vzeroupper; | ||
706 | |||
707 | ret; | ||
708 | ENDPROC(serpent_ecb_dec_16way) | ||
709 | |||
710 | ENTRY(serpent_cbc_dec_16way) | ||
711 | /* input: | ||
712 | * %rdi: ctx, CTX | ||
713 | * %rsi: dst | ||
714 | * %rdx: src | ||
715 | */ | ||
716 | |||
717 | vzeroupper; | ||
718 | |||
719 | load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
720 | |||
721 | call __serpent_dec_blk16; | ||
722 | |||
723 | store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2, | ||
724 | RK0); | ||
725 | |||
726 | vzeroupper; | ||
727 | |||
728 | ret; | ||
729 | ENDPROC(serpent_cbc_dec_16way) | ||
730 | |||
731 | ENTRY(serpent_ctr_16way) | ||
732 | /* input: | ||
733 | * %rdi: ctx, CTX | ||
734 | * %rsi: dst (16 blocks) | ||
735 | * %rdx: src (16 blocks) | ||
736 | * %rcx: iv (little endian, 128bit) | ||
737 | */ | ||
738 | |||
739 | vzeroupper; | ||
740 | |||
741 | load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
742 | RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, | ||
743 | tp); | ||
744 | |||
745 | call __serpent_enc_blk16; | ||
746 | |||
747 | store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
748 | |||
749 | vzeroupper; | ||
750 | |||
751 | ret; | ||
752 | ENDPROC(serpent_ctr_16way) | ||
753 | |||
754 | ENTRY(serpent_xts_enc_16way) | ||
755 | /* input: | ||
756 | * %rdi: ctx, CTX | ||
757 | * %rsi: dst (16 blocks) | ||
758 | * %rdx: src (16 blocks) | ||
759 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
760 | */ | ||
761 | |||
762 | vzeroupper; | ||
763 | |||
764 | load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
765 | RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, | ||
766 | .Lxts_gf128mul_and_shl1_mask_0, | ||
767 | .Lxts_gf128mul_and_shl1_mask_1); | ||
768 | |||
769 | call __serpent_enc_blk16; | ||
770 | |||
771 | store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
772 | |||
773 | vzeroupper; | ||
774 | |||
775 | ret; | ||
776 | ENDPROC(serpent_xts_enc_16way) | ||
777 | |||
778 | ENTRY(serpent_xts_dec_16way) | ||
779 | /* input: | ||
780 | * %rdi: ctx, CTX | ||
781 | * %rsi: dst (16 blocks) | ||
782 | * %rdx: src (16 blocks) | ||
783 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
784 | */ | ||
785 | |||
786 | vzeroupper; | ||
787 | |||
788 | load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
789 | RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, | ||
790 | .Lxts_gf128mul_and_shl1_mask_0, | ||
791 | .Lxts_gf128mul_and_shl1_mask_1); | ||
792 | |||
793 | call __serpent_dec_blk16; | ||
794 | |||
795 | store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
796 | |||
797 | vzeroupper; | ||
798 | |||
799 | ret; | ||
800 | ENDPROC(serpent_xts_dec_16way) | ||
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c new file mode 100644 index 000000000000..23aabc6c20a5 --- /dev/null +++ b/arch/x86/crypto/serpent_avx2_glue.c | |||
@@ -0,0 +1,562 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Serpent | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/lrw.h> | ||
20 | #include <crypto/xts.h> | ||
21 | #include <crypto/serpent.h> | ||
22 | #include <asm/xcr.h> | ||
23 | #include <asm/xsave.h> | ||
24 | #include <asm/crypto/serpent-avx.h> | ||
25 | #include <asm/crypto/ablk_helper.h> | ||
26 | #include <asm/crypto/glue_helper.h> | ||
27 | |||
28 | #define SERPENT_AVX2_PARALLEL_BLOCKS 16 | ||
29 | |||
30 | /* 16-way AVX2 parallel cipher functions */ | ||
31 | asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst, | ||
32 | const u8 *src); | ||
33 | asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst, | ||
34 | const u8 *src); | ||
35 | asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); | ||
36 | |||
37 | asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src, | ||
38 | le128 *iv); | ||
39 | asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, | ||
40 | const u8 *src, le128 *iv); | ||
41 | asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | |||
44 | static const struct common_glue_ctx serpent_enc = { | ||
45 | .num_funcs = 3, | ||
46 | .fpu_blocks_limit = 8, | ||
47 | |||
48 | .funcs = { { | ||
49 | .num_blocks = 16, | ||
50 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) } | ||
51 | }, { | ||
52 | .num_blocks = 8, | ||
53 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } | ||
54 | }, { | ||
55 | .num_blocks = 1, | ||
56 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } | ||
57 | } } | ||
58 | }; | ||
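Note on the common_glue_ctx tables: implementations are listed from widest to narrowest, and the glue helpers walk funcs[] in order, using the largest batch that still fits the remaining data before falling back to the plain __serpent_encrypt/__serpent_decrypt for the tail. fpu_blocks_limit = 8 keeps requests shorter than eight blocks on the scalar path, so the FPU/AVX state is not saved for tiny pieces of data. A simplified sketch of the batching idea, assuming the glue_helper.h types are in scope (it mirrors __glue_xts_crypt_128bit shown earlier, here in ECB form):

    static unsigned int dispatch_sketch(const struct common_glue_ctx *gctx,
                                        void *ctx, u8 *dst, const u8 *src,
                                        unsigned int nbytes)
    {
            const unsigned int bsize = 16;
            unsigned int i;

            for (i = 0; i < gctx->num_funcs; i++) {
                    unsigned int func_bytes = bsize * gctx->funcs[i].num_blocks;

                    while (nbytes >= func_bytes) {
                            gctx->funcs[i].fn_u.ecb(ctx, dst, src);
                            src += func_bytes;
                            dst += func_bytes;
                            nbytes -= func_bytes;
                    }
            }
            return nbytes;  /* leftover bytes of a partial block, if any */
    }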
59 | |||
60 | static const struct common_glue_ctx serpent_ctr = { | ||
61 | .num_funcs = 3, | ||
62 | .fpu_blocks_limit = 8, | ||
63 | |||
64 | .funcs = { { | ||
65 | .num_blocks = 16, | ||
66 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) } | ||
67 | }, { | ||
68 | .num_blocks = 8, | ||
69 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } | ||
70 | }, { | ||
71 | .num_blocks = 1, | ||
72 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } | ||
73 | } } | ||
74 | }; | ||
75 | |||
76 | static const struct common_glue_ctx serpent_enc_xts = { | ||
77 | .num_funcs = 3, | ||
78 | .fpu_blocks_limit = 8, | ||
79 | |||
80 | .funcs = { { | ||
81 | .num_blocks = 16, | ||
82 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) } | ||
83 | }, { | ||
84 | .num_blocks = 8, | ||
85 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } | ||
86 | }, { | ||
87 | .num_blocks = 1, | ||
88 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } | ||
89 | } } | ||
90 | }; | ||
91 | |||
92 | static const struct common_glue_ctx serpent_dec = { | ||
93 | .num_funcs = 3, | ||
94 | .fpu_blocks_limit = 8, | ||
95 | |||
96 | .funcs = { { | ||
97 | .num_blocks = 16, | ||
98 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) } | ||
99 | }, { | ||
100 | .num_blocks = 8, | ||
101 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } | ||
102 | }, { | ||
103 | .num_blocks = 1, | ||
104 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } | ||
105 | } } | ||
106 | }; | ||
107 | |||
108 | static const struct common_glue_ctx serpent_dec_cbc = { | ||
109 | .num_funcs = 3, | ||
110 | .fpu_blocks_limit = 8, | ||
111 | |||
112 | .funcs = { { | ||
113 | .num_blocks = 16, | ||
114 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) } | ||
115 | }, { | ||
116 | .num_blocks = 8, | ||
117 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } | ||
118 | }, { | ||
119 | .num_blocks = 1, | ||
120 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } | ||
121 | } } | ||
122 | }; | ||
123 | |||
124 | static const struct common_glue_ctx serpent_dec_xts = { | ||
125 | .num_funcs = 3, | ||
126 | .fpu_blocks_limit = 8, | ||
127 | |||
128 | .funcs = { { | ||
129 | .num_blocks = 16, | ||
130 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) } | ||
131 | }, { | ||
132 | .num_blocks = 8, | ||
133 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } | ||
134 | }, { | ||
135 | .num_blocks = 1, | ||
136 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } | ||
137 | } } | ||
138 | }; | ||
139 | |||
140 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
141 | struct scatterlist *src, unsigned int nbytes) | ||
142 | { | ||
143 | return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); | ||
144 | } | ||
145 | |||
146 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
147 | struct scatterlist *src, unsigned int nbytes) | ||
148 | { | ||
149 | return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); | ||
150 | } | ||
151 | |||
152 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
153 | struct scatterlist *src, unsigned int nbytes) | ||
154 | { | ||
155 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, | ||
156 | dst, src, nbytes); | ||
157 | } | ||
158 | |||
159 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
160 | struct scatterlist *src, unsigned int nbytes) | ||
161 | { | ||
162 | return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, | ||
163 | nbytes); | ||
164 | } | ||
165 | |||
166 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); | ||
170 | } | ||
171 | |||
172 | static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
173 | { | ||
174 | /* the AVX helpers are reused, so start using the FPU at 8 parallel blocks */ | ||
175 | return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); | ||
176 | } | ||
177 | |||
178 | static inline void serpent_fpu_end(bool fpu_enabled) | ||
179 | { | ||
180 | glue_fpu_end(fpu_enabled); | ||
181 | } | ||
182 | |||
183 | struct crypt_priv { | ||
184 | struct serpent_ctx *ctx; | ||
185 | bool fpu_enabled; | ||
186 | }; | ||
187 | |||
188 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
189 | { | ||
190 | const unsigned int bsize = SERPENT_BLOCK_SIZE; | ||
191 | struct crypt_priv *ctx = priv; | ||
192 | int i; | ||
193 | |||
194 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | ||
195 | |||
196 | if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { | ||
197 | serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
198 | srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
199 | nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
200 | } | ||
201 | |||
202 | while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { | ||
203 | serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); | ||
204 | srcdst += bsize * SERPENT_PARALLEL_BLOCKS; | ||
205 | nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; | ||
206 | } | ||
207 | |||
208 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
209 | __serpent_encrypt(ctx->ctx, srcdst, srcdst); | ||
210 | } | ||
211 | |||
212 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
213 | { | ||
214 | const unsigned int bsize = SERPENT_BLOCK_SIZE; | ||
215 | struct crypt_priv *ctx = priv; | ||
216 | int i; | ||
217 | |||
218 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | ||
219 | |||
220 | if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { | ||
221 | serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
222 | srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
223 | nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; | ||
224 | } | ||
225 | |||
226 | while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { | ||
227 | serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); | ||
228 | srcdst += bsize * SERPENT_PARALLEL_BLOCKS; | ||
229 | nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; | ||
230 | } | ||
231 | |||
232 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
233 | __serpent_decrypt(ctx->ctx, srcdst, srcdst); | ||
234 | } | ||
235 | |||
236 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
237 | struct scatterlist *src, unsigned int nbytes) | ||
238 | { | ||
239 | struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
240 | be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; | ||
241 | struct crypt_priv crypt_ctx = { | ||
242 | .ctx = &ctx->serpent_ctx, | ||
243 | .fpu_enabled = false, | ||
244 | }; | ||
245 | struct lrw_crypt_req req = { | ||
246 | .tbuf = buf, | ||
247 | .tbuflen = sizeof(buf), | ||
248 | |||
249 | .table_ctx = &ctx->lrw_table, | ||
250 | .crypt_ctx = &crypt_ctx, | ||
251 | .crypt_fn = encrypt_callback, | ||
252 | }; | ||
253 | int ret; | ||
254 | |||
255 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
256 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
257 | serpent_fpu_end(crypt_ctx.fpu_enabled); | ||
258 | |||
259 | return ret; | ||
260 | } | ||
261 | |||
262 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
263 | struct scatterlist *src, unsigned int nbytes) | ||
264 | { | ||
265 | struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
266 | be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; | ||
267 | struct crypt_priv crypt_ctx = { | ||
268 | .ctx = &ctx->serpent_ctx, | ||
269 | .fpu_enabled = false, | ||
270 | }; | ||
271 | struct lrw_crypt_req req = { | ||
272 | .tbuf = buf, | ||
273 | .tbuflen = sizeof(buf), | ||
274 | |||
275 | .table_ctx = &ctx->lrw_table, | ||
276 | .crypt_ctx = &crypt_ctx, | ||
277 | .crypt_fn = decrypt_callback, | ||
278 | }; | ||
279 | int ret; | ||
280 | |||
281 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
282 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
283 | serpent_fpu_end(crypt_ctx.fpu_enabled); | ||
284 | |||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
289 | struct scatterlist *src, unsigned int nbytes) | ||
290 | { | ||
291 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
292 | |||
293 | return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, | ||
294 | XTS_TWEAK_CAST(__serpent_encrypt), | ||
295 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
296 | } | ||
297 | |||
298 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
299 | struct scatterlist *src, unsigned int nbytes) | ||
300 | { | ||
301 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
302 | |||
303 | return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, | ||
304 | XTS_TWEAK_CAST(__serpent_encrypt), | ||
305 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
306 | } | ||
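Note: xts_encrypt and xts_decrypt both pass __serpent_encrypt as the tweak function, because the XTS tweak T = E(tweak key, IV) is always computed with the cipher in encryption direction; only the common_glue_ctx handed to glue_xts_crypt_128bit (serpent_enc_xts vs serpent_dec_xts) decides whether the data itself is encrypted or decrypted.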
307 | |||
308 | static struct crypto_alg srp_algs[10] = { { | ||
309 | .cra_name = "__ecb-serpent-avx2", | ||
310 | .cra_driver_name = "__driver-ecb-serpent-avx2", | ||
311 | .cra_priority = 0, | ||
312 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
313 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
314 | .cra_ctxsize = sizeof(struct serpent_ctx), | ||
315 | .cra_alignmask = 0, | ||
316 | .cra_type = &crypto_blkcipher_type, | ||
317 | .cra_module = THIS_MODULE, | ||
318 | .cra_list = LIST_HEAD_INIT(srp_algs[0].cra_list), | ||
319 | .cra_u = { | ||
320 | .blkcipher = { | ||
321 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
322 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
323 | .setkey = serpent_setkey, | ||
324 | .encrypt = ecb_encrypt, | ||
325 | .decrypt = ecb_decrypt, | ||
326 | }, | ||
327 | }, | ||
328 | }, { | ||
329 | .cra_name = "__cbc-serpent-avx2", | ||
330 | .cra_driver_name = "__driver-cbc-serpent-avx2", | ||
331 | .cra_priority = 0, | ||
332 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
333 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
334 | .cra_ctxsize = sizeof(struct serpent_ctx), | ||
335 | .cra_alignmask = 0, | ||
336 | .cra_type = &crypto_blkcipher_type, | ||
337 | .cra_module = THIS_MODULE, | ||
338 | .cra_list = LIST_HEAD_INIT(srp_algs[1].cra_list), | ||
339 | .cra_u = { | ||
340 | .blkcipher = { | ||
341 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
342 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
343 | .setkey = serpent_setkey, | ||
344 | .encrypt = cbc_encrypt, | ||
345 | .decrypt = cbc_decrypt, | ||
346 | }, | ||
347 | }, | ||
348 | }, { | ||
349 | .cra_name = "__ctr-serpent-avx2", | ||
350 | .cra_driver_name = "__driver-ctr-serpent-avx2", | ||
351 | .cra_priority = 0, | ||
352 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
353 | .cra_blocksize = 1, | ||
354 | .cra_ctxsize = sizeof(struct serpent_ctx), | ||
355 | .cra_alignmask = 0, | ||
356 | .cra_type = &crypto_blkcipher_type, | ||
357 | .cra_module = THIS_MODULE, | ||
358 | .cra_list = LIST_HEAD_INIT(srp_algs[2].cra_list), | ||
359 | .cra_u = { | ||
360 | .blkcipher = { | ||
361 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
362 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
363 | .ivsize = SERPENT_BLOCK_SIZE, | ||
364 | .setkey = serpent_setkey, | ||
365 | .encrypt = ctr_crypt, | ||
366 | .decrypt = ctr_crypt, | ||
367 | }, | ||
368 | }, | ||
369 | }, { | ||
370 | .cra_name = "__lrw-serpent-avx2", | ||
371 | .cra_driver_name = "__driver-lrw-serpent-avx2", | ||
372 | .cra_priority = 0, | ||
373 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
374 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
375 | .cra_ctxsize = sizeof(struct serpent_lrw_ctx), | ||
376 | .cra_alignmask = 0, | ||
377 | .cra_type = &crypto_blkcipher_type, | ||
378 | .cra_module = THIS_MODULE, | ||
379 | .cra_list = LIST_HEAD_INIT(srp_algs[3].cra_list), | ||
380 | .cra_exit = lrw_serpent_exit_tfm, | ||
381 | .cra_u = { | ||
382 | .blkcipher = { | ||
383 | .min_keysize = SERPENT_MIN_KEY_SIZE + | ||
384 | SERPENT_BLOCK_SIZE, | ||
385 | .max_keysize = SERPENT_MAX_KEY_SIZE + | ||
386 | SERPENT_BLOCK_SIZE, | ||
387 | .ivsize = SERPENT_BLOCK_SIZE, | ||
388 | .setkey = lrw_serpent_setkey, | ||
389 | .encrypt = lrw_encrypt, | ||
390 | .decrypt = lrw_decrypt, | ||
391 | }, | ||
392 | }, | ||
393 | }, { | ||
394 | .cra_name = "__xts-serpent-avx2", | ||
395 | .cra_driver_name = "__driver-xts-serpent-avx2", | ||
396 | .cra_priority = 0, | ||
397 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
398 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
399 | .cra_ctxsize = sizeof(struct serpent_xts_ctx), | ||
400 | .cra_alignmask = 0, | ||
401 | .cra_type = &crypto_blkcipher_type, | ||
402 | .cra_module = THIS_MODULE, | ||
403 | .cra_list = LIST_HEAD_INIT(srp_algs[4].cra_list), | ||
404 | .cra_u = { | ||
405 | .blkcipher = { | ||
406 | .min_keysize = SERPENT_MIN_KEY_SIZE * 2, | ||
407 | .max_keysize = SERPENT_MAX_KEY_SIZE * 2, | ||
408 | .ivsize = SERPENT_BLOCK_SIZE, | ||
409 | .setkey = xts_serpent_setkey, | ||
410 | .encrypt = xts_encrypt, | ||
411 | .decrypt = xts_decrypt, | ||
412 | }, | ||
413 | }, | ||
414 | }, { | ||
415 | .cra_name = "ecb(serpent)", | ||
416 | .cra_driver_name = "ecb-serpent-avx2", | ||
417 | .cra_priority = 600, | ||
418 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
419 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
420 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
421 | .cra_alignmask = 0, | ||
422 | .cra_type = &crypto_ablkcipher_type, | ||
423 | .cra_module = THIS_MODULE, | ||
424 | .cra_list = LIST_HEAD_INIT(srp_algs[5].cra_list), | ||
425 | .cra_init = ablk_init, | ||
426 | .cra_exit = ablk_exit, | ||
427 | .cra_u = { | ||
428 | .ablkcipher = { | ||
429 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
430 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
431 | .setkey = ablk_set_key, | ||
432 | .encrypt = ablk_encrypt, | ||
433 | .decrypt = ablk_decrypt, | ||
434 | }, | ||
435 | }, | ||
436 | }, { | ||
437 | .cra_name = "cbc(serpent)", | ||
438 | .cra_driver_name = "cbc-serpent-avx2", | ||
439 | .cra_priority = 600, | ||
440 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
441 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
442 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
443 | .cra_alignmask = 0, | ||
444 | .cra_type = &crypto_ablkcipher_type, | ||
445 | .cra_module = THIS_MODULE, | ||
446 | .cra_list = LIST_HEAD_INIT(srp_algs[6].cra_list), | ||
447 | .cra_init = ablk_init, | ||
448 | .cra_exit = ablk_exit, | ||
449 | .cra_u = { | ||
450 | .ablkcipher = { | ||
451 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
452 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
453 | .ivsize = SERPENT_BLOCK_SIZE, | ||
454 | .setkey = ablk_set_key, | ||
455 | .encrypt = __ablk_encrypt, | ||
456 | .decrypt = ablk_decrypt, | ||
457 | }, | ||
458 | }, | ||
459 | }, { | ||
460 | .cra_name = "ctr(serpent)", | ||
461 | .cra_driver_name = "ctr-serpent-avx2", | ||
462 | .cra_priority = 600, | ||
463 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
464 | .cra_blocksize = 1, | ||
465 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
466 | .cra_alignmask = 0, | ||
467 | .cra_type = &crypto_ablkcipher_type, | ||
468 | .cra_module = THIS_MODULE, | ||
469 | .cra_list = LIST_HEAD_INIT(srp_algs[7].cra_list), | ||
470 | .cra_init = ablk_init, | ||
471 | .cra_exit = ablk_exit, | ||
472 | .cra_u = { | ||
473 | .ablkcipher = { | ||
474 | .min_keysize = SERPENT_MIN_KEY_SIZE, | ||
475 | .max_keysize = SERPENT_MAX_KEY_SIZE, | ||
476 | .ivsize = SERPENT_BLOCK_SIZE, | ||
477 | .setkey = ablk_set_key, | ||
478 | .encrypt = ablk_encrypt, | ||
479 | .decrypt = ablk_encrypt, | ||
480 | .geniv = "chainiv", | ||
481 | }, | ||
482 | }, | ||
483 | }, { | ||
484 | .cra_name = "lrw(serpent)", | ||
485 | .cra_driver_name = "lrw-serpent-avx2", | ||
486 | .cra_priority = 600, | ||
487 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
488 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
489 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
490 | .cra_alignmask = 0, | ||
491 | .cra_type = &crypto_ablkcipher_type, | ||
492 | .cra_module = THIS_MODULE, | ||
493 | .cra_list = LIST_HEAD_INIT(srp_algs[8].cra_list), | ||
494 | .cra_init = ablk_init, | ||
495 | .cra_exit = ablk_exit, | ||
496 | .cra_u = { | ||
497 | .ablkcipher = { | ||
498 | .min_keysize = SERPENT_MIN_KEY_SIZE + | ||
499 | SERPENT_BLOCK_SIZE, | ||
500 | .max_keysize = SERPENT_MAX_KEY_SIZE + | ||
501 | SERPENT_BLOCK_SIZE, | ||
502 | .ivsize = SERPENT_BLOCK_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_decrypt, | ||
506 | }, | ||
507 | }, | ||
508 | }, { | ||
509 | .cra_name = "xts(serpent)", | ||
510 | .cra_driver_name = "xts-serpent-avx2", | ||
511 | .cra_priority = 600, | ||
512 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
513 | .cra_blocksize = SERPENT_BLOCK_SIZE, | ||
514 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
515 | .cra_alignmask = 0, | ||
516 | .cra_type = &crypto_ablkcipher_type, | ||
517 | .cra_module = THIS_MODULE, | ||
518 | .cra_list = LIST_HEAD_INIT(srp_algs[9].cra_list), | ||
519 | .cra_init = ablk_init, | ||
520 | .cra_exit = ablk_exit, | ||
521 | .cra_u = { | ||
522 | .ablkcipher = { | ||
523 | .min_keysize = SERPENT_MIN_KEY_SIZE * 2, | ||
524 | .max_keysize = SERPENT_MAX_KEY_SIZE * 2, | ||
525 | .ivsize = SERPENT_BLOCK_SIZE, | ||
526 | .setkey = ablk_set_key, | ||
527 | .encrypt = ablk_encrypt, | ||
528 | .decrypt = ablk_decrypt, | ||
529 | }, | ||
530 | }, | ||
531 | } }; | ||
532 | |||
533 | static int __init init(void) | ||
534 | { | ||
535 | u64 xcr0; | ||
536 | |||
537 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
538 | pr_info("AVX2 instructions are not detected.\n"); | ||
539 | return -ENODEV; | ||
540 | } | ||
541 | |||
542 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
543 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
544 | pr_info("AVX detected but unusable.\n"); | ||
545 | return -ENODEV; | ||
546 | } | ||
547 | |||
548 | return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs)); | ||
549 | } | ||
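The module only registers its algorithms when both conditions hold: the CPU advertises AVX2 and OSXSAVE, and XCR0 shows that the OS actually saves the SSE and YMM register state across context switches. A minimal sketch of that second test, using the raw XGETBV encoding instead of the kernel's xgetbv()/XSTATE_* helpers (names below are illustrative, not part of the patch):

    /* Sketch: read XCR0 and check that the SSE (bit 1) and YMM (bit 2)
     * state components are enabled by the OS. */
    static inline int ymm_state_usable(void)
    {
            unsigned int eax, edx;

            /* xgetbv; %ecx = 0 selects XCR0 */
            asm volatile(".byte 0x0f, 0x01, 0xd0"
                         : "=a" (eax), "=d" (edx) : "c" (0));
            return (eax & 0x6) == 0x6;      /* XSTATE_SSE | XSTATE_YMM */
    }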
550 | |||
551 | static void __exit fini(void) | ||
552 | { | ||
553 | crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs)); | ||
554 | } | ||
555 | |||
556 | module_init(init); | ||
557 | module_exit(fini); | ||
558 | |||
559 | MODULE_LICENSE("GPL"); | ||
560 | MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); | ||
561 | MODULE_ALIAS("serpent"); | ||
562 | MODULE_ALIAS("serpent-asm"); | ||
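With the module loaded, the ecb/cbc/ctr/lrw/xts "serpent" instances registered above are reachable from user space through the AF_ALG socket interface, provided CONFIG_CRYPTO_USER_API_SKCIPHER is enabled. A rough sketch for obtaining an operation descriptor on xts(serpent) with a 512-bit key (error handling trimmed, helper name illustrative):

    #include <linux/if_alg.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int xts_serpent_opfd(const unsigned char key[64])
    {
            struct sockaddr_alg sa = {
                    .salg_family = AF_ALG,
                    .salg_type   = "skcipher",
                    .salg_name   = "xts(serpent)",
            };
            int tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);

            if (tfm < 0)
                    return -1;
            if (bind(tfm, (struct sockaddr *)&sa, sizeof(sa)) ||
                setsockopt(tfm, SOL_ALG, ALG_SET_KEY, key, 64)) {
                    close(tfm);
                    return -1;
            }
            /* the returned fd takes data via sendmsg()/read(), with
             * ALG_SET_OP and ALG_SET_IV passed as control messages */
            return accept(tfm, NULL, 0);
    }

Since the kernel resolves "xts(serpent)" to the highest-priority registered implementation, a request like this ends up on the priority-600 xts-serpent-avx2 driver above whenever AVX2 is usable.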
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 52abaaf28e7f..9ae83cf8d21e 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c | |||
@@ -4,8 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Glue code based on serpent_sse2_glue.c by: | 7 | * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
9 | * | 8 | * |
10 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -42,7 +41,32 @@ | |||
42 | #include <asm/crypto/ablk_helper.h> | 41 | #include <asm/crypto/ablk_helper.h> |
43 | #include <asm/crypto/glue_helper.h> | 42 | #include <asm/crypto/glue_helper.h> |
44 | 43 | ||
45 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 44 | /* 8-way parallel cipher functions */ |
45 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
46 | const u8 *src); | ||
47 | EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx); | ||
48 | |||
49 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
50 | const u8 *src); | ||
51 | EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx); | ||
52 | |||
53 | asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
54 | const u8 *src); | ||
55 | EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); | ||
56 | |||
57 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
58 | const u8 *src, le128 *iv); | ||
59 | EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); | ||
60 | |||
61 | asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
62 | const u8 *src, le128 *iv); | ||
63 | EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); | ||
64 | |||
65 | asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
66 | const u8 *src, le128 *iv); | ||
67 | EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); | ||
68 | |||
69 | void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
46 | { | 70 | { |
47 | be128 ctrblk; | 71 | be128 ctrblk; |
48 | 72 | ||
@@ -52,6 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) | |||
52 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); | 76 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); |
53 | u128_xor(dst, src, (u128 *)&ctrblk); | 77 | u128_xor(dst, src, (u128 *)&ctrblk); |
54 | } | 78 | } |
79 | EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); | ||
80 | |||
81 | void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
82 | { | ||
83 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
84 | GLUE_FUNC_CAST(__serpent_encrypt)); | ||
85 | } | ||
86 | EXPORT_SYMBOL_GPL(serpent_xts_enc); | ||
87 | |||
88 | void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
89 | { | ||
90 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
91 | GLUE_FUNC_CAST(__serpent_decrypt)); | ||
92 | } | ||
93 | EXPORT_SYMBOL_GPL(serpent_xts_dec); | ||
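serpent_xts_enc() and serpent_xts_dec() are the single-block fallbacks used by the XTS glue once fewer than eight blocks remain. Per block, XTS computes C = E_K1(P xor T) xor T and then advances the tweak T by a multiplication with x in GF(2^128). A scalar sketch of that step, with illustrative types rather than the glue_helper API:

    struct u128_pair { unsigned long long lo, hi; };

    /* T <- T * x modulo x^128 + x^7 + x^2 + x + 1 (tweak held little endian) */
    static void xts_gf128_mul_x(struct u128_pair *t)
    {
            unsigned long long carry = t->hi >> 63;

            t->hi = (t->hi << 1) | (t->lo >> 63);
            t->lo = (t->lo << 1) ^ (carry * 0x87);
    }

    static void xts_one_block(struct u128_pair *dst, const struct u128_pair *src,
                              struct u128_pair *tweak,
                              void (*crypt)(struct u128_pair *blk))
    {
            dst->lo = src->lo ^ tweak->lo;
            dst->hi = src->hi ^ tweak->hi;
            crypt(dst);                     /* E_K1 when encrypting, D_K1 when decrypting */
            dst->lo ^= tweak->lo;
            dst->hi ^= tweak->hi;
            xts_gf128_mul_x(tweak);         /* tweak for the next block */
    }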
94 | |||
55 | 95 | ||
56 | static const struct common_glue_ctx serpent_enc = { | 96 | static const struct common_glue_ctx serpent_enc = { |
57 | .num_funcs = 2, | 97 | .num_funcs = 2, |
@@ -75,7 +115,20 @@ static const struct common_glue_ctx serpent_ctr = { | |||
75 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } | 115 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } |
76 | }, { | 116 | }, { |
77 | .num_blocks = 1, | 117 | .num_blocks = 1, |
78 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } | 118 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } |
119 | } } | ||
120 | }; | ||
121 | |||
122 | static const struct common_glue_ctx serpent_enc_xts = { | ||
123 | .num_funcs = 2, | ||
124 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, | ||
125 | |||
126 | .funcs = { { | ||
127 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | ||
128 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } | ||
129 | }, { | ||
130 | .num_blocks = 1, | ||
131 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } | ||
79 | } } | 132 | } } |
80 | }; | 133 | }; |
81 | 134 | ||
@@ -105,6 +158,19 @@ static const struct common_glue_ctx serpent_dec_cbc = { | |||
105 | } } | 158 | } } |
106 | }; | 159 | }; |
107 | 160 | ||
161 | static const struct common_glue_ctx serpent_dec_xts = { | ||
162 | .num_funcs = 2, | ||
163 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, | ||
164 | |||
165 | .funcs = { { | ||
166 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | ||
167 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } | ||
168 | }, { | ||
169 | .num_blocks = 1, | ||
170 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } | ||
171 | } } | ||
172 | }; | ||
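Both XTS tables pair the eight-block AVX routine with the one-block fallback above. glue_xts_crypt_128bit() first encrypts the IV with the tweak context and then walks such a table, always using the widest entry that still fits the remaining data before dropping down to the single-block function. Roughly, and much simplified compared to the real glue_helper loop (which also manages the FPU sections):

    /* nblocks 16-byte blocks at src/dst; funcs[] is ordered widest first */
    while (nblocks) {
            const struct xts_func_sketch *f = funcs;

            while (f->num_blocks > nblocks)
                    f++;                            /* fall back to a narrower routine */

            f->fn(ctx, dst, src, tweak);            /* also advances the tweak */
            src     += f->num_blocks;
            dst     += f->num_blocks;
            nblocks -= f->num_blocks;
    }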
173 | |||
108 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 174 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
109 | struct scatterlist *src, unsigned int nbytes) | 175 | struct scatterlist *src, unsigned int nbytes) |
110 | { | 176 | { |
@@ -187,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
187 | __serpent_decrypt(ctx->ctx, srcdst, srcdst); | 253 | __serpent_decrypt(ctx->ctx, srcdst, srcdst); |
188 | } | 254 | } |
189 | 255 | ||
190 | struct serpent_lrw_ctx { | 256 | int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, |
191 | struct lrw_table_ctx lrw_table; | 257 | unsigned int keylen) |
192 | struct serpent_ctx serpent_ctx; | ||
193 | }; | ||
194 | |||
195 | static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
196 | unsigned int keylen) | ||
197 | { | 258 | { |
198 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); | 259 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); |
199 | int err; | 260 | int err; |
@@ -206,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | |||
206 | return lrw_init_table(&ctx->lrw_table, key + keylen - | 267 | return lrw_init_table(&ctx->lrw_table, key + keylen - |
207 | SERPENT_BLOCK_SIZE); | 268 | SERPENT_BLOCK_SIZE); |
208 | } | 269 | } |
270 | EXPORT_SYMBOL_GPL(lrw_serpent_setkey); | ||
209 | 271 | ||
210 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 272 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
211 | struct scatterlist *src, unsigned int nbytes) | 273 | struct scatterlist *src, unsigned int nbytes) |
@@ -259,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
259 | return ret; | 321 | return ret; |
260 | } | 322 | } |
261 | 323 | ||
262 | static void lrw_exit_tfm(struct crypto_tfm *tfm) | 324 | void lrw_serpent_exit_tfm(struct crypto_tfm *tfm) |
263 | { | 325 | { |
264 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); | 326 | struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); |
265 | 327 | ||
266 | lrw_free_table(&ctx->lrw_table); | 328 | lrw_free_table(&ctx->lrw_table); |
267 | } | 329 | } |
330 | EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm); | ||
268 | 331 | ||
269 | struct serpent_xts_ctx { | 332 | int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, |
270 | struct serpent_ctx tweak_ctx; | 333 | unsigned int keylen) |
271 | struct serpent_ctx crypt_ctx; | ||
272 | }; | ||
273 | |||
274 | static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
275 | unsigned int keylen) | ||
276 | { | 334 | { |
277 | struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); | 335 | struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); |
278 | u32 *flags = &tfm->crt_flags; | 336 | u32 *flags = &tfm->crt_flags; |
@@ -294,59 +352,26 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | |||
294 | /* second half of xts-key is for tweak */ | 352 | /* second half of xts-key is for tweak */ |
295 | return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); | 353 | return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); |
296 | } | 354 | } |
355 | EXPORT_SYMBOL_GPL(xts_serpent_setkey); | ||
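As the surviving comment notes, an XTS key is two concatenated Serpent keys: the first half keys the data cipher (crypt_ctx), the second half keys the tweak cipher (tweak_ctx). Stripped of the key-length validation and CRYPTO_TFM_RES flag handling, the split amounts to the following (helper name illustrative):

    static int xts_split_key(struct serpent_xts_ctx *ctx, const u8 *key,
                             unsigned int keylen)
    {
            int err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);

            if (err)
                    return err;
            /* second half of the xts key is for the tweak */
            return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
    }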
297 | 356 | ||
298 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 357 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
299 | struct scatterlist *src, unsigned int nbytes) | 358 | struct scatterlist *src, unsigned int nbytes) |
300 | { | 359 | { |
301 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 360 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
302 | be128 buf[SERPENT_PARALLEL_BLOCKS]; | ||
303 | struct crypt_priv crypt_ctx = { | ||
304 | .ctx = &ctx->crypt_ctx, | ||
305 | .fpu_enabled = false, | ||
306 | }; | ||
307 | struct xts_crypt_req req = { | ||
308 | .tbuf = buf, | ||
309 | .tbuflen = sizeof(buf), | ||
310 | |||
311 | .tweak_ctx = &ctx->tweak_ctx, | ||
312 | .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), | ||
313 | .crypt_ctx = &crypt_ctx, | ||
314 | .crypt_fn = encrypt_callback, | ||
315 | }; | ||
316 | int ret; | ||
317 | |||
318 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
319 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
320 | serpent_fpu_end(crypt_ctx.fpu_enabled); | ||
321 | 361 | ||
322 | return ret; | 362 | return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, |
363 | XTS_TWEAK_CAST(__serpent_encrypt), | ||
364 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
323 | } | 365 | } |
324 | 366 | ||
325 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 367 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
326 | struct scatterlist *src, unsigned int nbytes) | 368 | struct scatterlist *src, unsigned int nbytes) |
327 | { | 369 | { |
328 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 370 | struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
329 | be128 buf[SERPENT_PARALLEL_BLOCKS]; | ||
330 | struct crypt_priv crypt_ctx = { | ||
331 | .ctx = &ctx->crypt_ctx, | ||
332 | .fpu_enabled = false, | ||
333 | }; | ||
334 | struct xts_crypt_req req = { | ||
335 | .tbuf = buf, | ||
336 | .tbuflen = sizeof(buf), | ||
337 | |||
338 | .tweak_ctx = &ctx->tweak_ctx, | ||
339 | .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), | ||
340 | .crypt_ctx = &crypt_ctx, | ||
341 | .crypt_fn = decrypt_callback, | ||
342 | }; | ||
343 | int ret; | ||
344 | 371 | ||
345 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 372 | return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, |
346 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 373 | XTS_TWEAK_CAST(__serpent_encrypt), |
347 | serpent_fpu_end(crypt_ctx.fpu_enabled); | 374 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
348 | |||
349 | return ret; | ||
350 | } | 375 | } |
351 | 376 | ||
352 | static struct crypto_alg serpent_algs[10] = { { | 377 | static struct crypto_alg serpent_algs[10] = { { |
@@ -417,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { { | |||
417 | .cra_alignmask = 0, | 442 | .cra_alignmask = 0, |
418 | .cra_type = &crypto_blkcipher_type, | 443 | .cra_type = &crypto_blkcipher_type, |
419 | .cra_module = THIS_MODULE, | 444 | .cra_module = THIS_MODULE, |
420 | .cra_exit = lrw_exit_tfm, | 445 | .cra_exit = lrw_serpent_exit_tfm, |
421 | .cra_u = { | 446 | .cra_u = { |
422 | .blkcipher = { | 447 | .blkcipher = { |
423 | .min_keysize = SERPENT_MIN_KEY_SIZE + | 448 | .min_keysize = SERPENT_MIN_KEY_SIZE + |
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S new file mode 100644 index 000000000000..56610c4bf31b --- /dev/null +++ b/arch/x86/crypto/sha256-avx-asm.S | |||
@@ -0,0 +1,496 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-256 with AVX1 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
10 | # | ||
11 | # This software is available to you under a choice of one of two | ||
12 | # licenses. You may choose to be licensed under the terms of the GNU | ||
13 | # General Public License (GPL) Version 2, available from the file | ||
14 | # COPYING in the main directory of this source tree, or the | ||
15 | # OpenIB.org BSD license below: | ||
16 | # | ||
17 | # Redistribution and use in source and binary forms, with or | ||
18 | # without modification, are permitted provided that the following | ||
19 | # conditions are met: | ||
20 | # | ||
21 | # - Redistributions of source code must retain the above | ||
22 | # copyright notice, this list of conditions and the following | ||
23 | # disclaimer. | ||
24 | # | ||
25 | # - Redistributions in binary form must reproduce the above | ||
26 | # copyright notice, this list of conditions and the following | ||
27 | # disclaimer in the documentation and/or other materials | ||
28 | # provided with the distribution. | ||
29 | # | ||
30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
31 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
32 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
33 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
34 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
35 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
36 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
37 | # SOFTWARE. | ||
38 | ######################################################################## | ||
39 | # | ||
40 | # This code is described in an Intel White-Paper: | ||
41 | # "Fast SHA-256 Implementations on Intel Architecture Processors" | ||
42 | # | ||
43 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
44 | # and search for that title. | ||
45 | # | ||
46 | ######################################################################## | ||
47 | # This code schedules 1 block at a time, with 4 lanes per block | ||
48 | ######################################################################## | ||
49 | |||
50 | #ifdef CONFIG_AS_AVX | ||
51 | #include <linux/linkage.h> | ||
52 | |||
53 | ## assume buffers not aligned | ||
54 | #define VMOVDQ vmovdqu | ||
55 | |||
56 | ################################ Define Macros | ||
57 | |||
58 | # addm [mem], reg | ||
59 | # Add reg to mem using reg-mem add and store | ||
60 | .macro addm p1 p2 | ||
61 | add \p1, \p2 | ||
62 | mov \p2, \p1 | ||
63 | .endm | ||
64 | |||
65 | |||
66 | .macro MY_ROR p1 p2 | ||
67 | shld $(32-(\p1)), \p2, \p2 | ||
68 | .endm | ||
69 | |||
70 | ################################ | ||
71 | |||
72 | # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask | ||
73 | # Load xmm with mem and byte swap each dword | ||
74 | .macro COPY_XMM_AND_BSWAP p1 p2 p3 | ||
75 | VMOVDQ \p2, \p1 | ||
76 | vpshufb \p3, \p1, \p1 | ||
77 | .endm | ||
78 | |||
79 | ################################ | ||
80 | |||
81 | X0 = %xmm4 | ||
82 | X1 = %xmm5 | ||
83 | X2 = %xmm6 | ||
84 | X3 = %xmm7 | ||
85 | |||
86 | XTMP0 = %xmm0 | ||
87 | XTMP1 = %xmm1 | ||
88 | XTMP2 = %xmm2 | ||
89 | XTMP3 = %xmm3 | ||
90 | XTMP4 = %xmm8 | ||
91 | XFER = %xmm9 | ||
92 | XTMP5 = %xmm11 | ||
93 | |||
94 | SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA | ||
95 | SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 | ||
96 | BYTE_FLIP_MASK = %xmm13 | ||
97 | |||
98 | NUM_BLKS = %rdx # 3rd arg | ||
99 | CTX = %rsi # 2nd arg | ||
100 | INP = %rdi # 1st arg | ||
101 | |||
102 | SRND = %rdi # clobbers INP | ||
103 | c = %ecx | ||
104 | d = %r8d | ||
105 | e = %edx | ||
106 | TBL = %rbp | ||
107 | a = %eax | ||
108 | b = %ebx | ||
109 | |||
110 | f = %r9d | ||
111 | g = %r10d | ||
112 | h = %r11d | ||
113 | |||
114 | y0 = %r13d | ||
115 | y1 = %r14d | ||
116 | y2 = %r15d | ||
117 | |||
118 | |||
119 | _INP_END_SIZE = 8 | ||
120 | _INP_SIZE = 8 | ||
121 | _XFER_SIZE = 8 | ||
122 | _XMM_SAVE_SIZE = 0 | ||
123 | |||
124 | _INP_END = 0 | ||
125 | _INP = _INP_END + _INP_END_SIZE | ||
126 | _XFER = _INP + _INP_SIZE | ||
127 | _XMM_SAVE = _XFER + _XFER_SIZE | ||
128 | STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE | ||
129 | |||
130 | # rotate_Xs | ||
131 | # Rotate values of symbols X0...X3 | ||
132 | .macro rotate_Xs | ||
133 | X_ = X0 | ||
134 | X0 = X1 | ||
135 | X1 = X2 | ||
136 | X2 = X3 | ||
137 | X3 = X_ | ||
138 | .endm | ||
139 | |||
140 | # ROTATE_ARGS | ||
141 | # Rotate values of symbols a...h | ||
142 | .macro ROTATE_ARGS | ||
143 | TMP_ = h | ||
144 | h = g | ||
145 | g = f | ||
146 | f = e | ||
147 | e = d | ||
148 | d = c | ||
149 | c = b | ||
150 | b = a | ||
151 | a = TMP_ | ||
152 | .endm | ||
153 | |||
154 | .macro FOUR_ROUNDS_AND_SCHED | ||
155 | ## compute s0 four at a time and s1 two at a time | ||
156 | ## compute W[-16] + W[-7] 4 at a time | ||
157 | |||
158 | mov e, y0 # y0 = e | ||
159 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
160 | mov a, y1 # y1 = a | ||
161 | vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] | ||
162 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
163 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
164 | mov f, y2 # y2 = f | ||
165 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
166 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
167 | xor g, y2 # y2 = f^g | ||
168 | vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] | ||
169 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
170 | and e, y2 # y2 = (f^g)&e | ||
171 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
172 | ## compute s0 | ||
173 | vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] | ||
174 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
175 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
176 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
177 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
178 | add y0, y2 # y2 = S1 + CH | ||
179 | add _XFER(%rsp), y2 # y2 = k + w + S1 + CH | ||
180 | mov a, y0 # y0 = a | ||
181 | add y2, h # h = h + S1 + CH + k + w | ||
182 | mov a, y2 # y2 = a | ||
183 | vpsrld $7, XTMP1, XTMP2 | ||
184 | or c, y0 # y0 = a|c | ||
185 | add h, d # d = d + h + S1 + CH + k + w | ||
186 | and c, y2 # y2 = a&c | ||
187 | vpslld $(32-7), XTMP1, XTMP3 | ||
188 | and b, y0 # y0 = (a|c)&b | ||
189 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
190 | vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 | ||
191 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
192 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
193 | ROTATE_ARGS | ||
194 | mov e, y0 # y0 = e | ||
195 | mov a, y1 # y1 = a | ||
196 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
197 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
198 | mov f, y2 # y2 = f | ||
199 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
200 | vpsrld $18, XTMP1, XTMP2 # | ||
201 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
202 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
203 | xor g, y2 # y2 = f^g | ||
204 | vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 | ||
205 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
206 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
207 | and e, y2 # y2 = (f^g)&e | ||
208 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
209 | vpslld $(32-18), XTMP1, XTMP1 | ||
210 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
211 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
212 | vpxor XTMP1, XTMP3, XTMP3 # | ||
213 | add y0, y2 # y2 = S1 + CH | ||
214 | add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
215 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
216 | vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR | ||
217 | mov a, y0 # y0 = a | ||
218 | add y2, h # h = h + S1 + CH + k + w | ||
219 | mov a, y2 # y2 = a | ||
220 | vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 | ||
221 | or c, y0 # y0 = a|c | ||
222 | add h, d # d = d + h + S1 + CH + k + w | ||
223 | and c, y2 # y2 = a&c | ||
224 | ## compute low s1 | ||
225 | vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
226 | and b, y0 # y0 = (a|c)&b | ||
227 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
228 | vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 | ||
229 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
230 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
231 | ROTATE_ARGS | ||
232 | mov e, y0 # y0 = e | ||
233 | mov a, y1 # y1 = a | ||
234 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
235 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
236 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
237 | mov f, y2 # y2 = f | ||
238 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
239 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
240 | vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} | ||
241 | xor g, y2 # y2 = f^g | ||
242 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} | ||
243 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
244 | and e, y2 # y2 = (f^g)&e | ||
245 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} | ||
246 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
247 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
248 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
249 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
250 | vpxor XTMP3, XTMP2, XTMP2 # | ||
251 | add y0, y2 # y2 = S1 + CH | ||
252 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
253 | add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
254 | vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} | ||
255 | mov a, y0 # y0 = a | ||
256 | add y2, h # h = h + S1 + CH + k + w | ||
257 | mov a, y2 # y2 = a | ||
258 | vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} | ||
259 | or c, y0 # y0 = a|c | ||
260 | add h, d # d = d + h + S1 + CH + k + w | ||
261 | and c, y2 # y2 = a&c | ||
262 | vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} | ||
263 | and b, y0 # y0 = (a|c)&b | ||
264 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
265 | ## compute high s1 | ||
266 | vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} | ||
267 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
268 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
269 | ROTATE_ARGS | ||
270 | mov e, y0 # y0 = e | ||
271 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
272 | mov a, y1 # y1 = a | ||
273 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
274 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
275 | mov f, y2 # y2 = f | ||
276 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
277 | vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} | ||
278 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
279 | xor g, y2 # y2 = f^g | ||
280 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} | ||
281 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
282 | and e, y2 # y2 = (f^g)&e | ||
283 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
284 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} | ||
285 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
286 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
287 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
288 | vpxor XTMP3, XTMP2, XTMP2 | ||
289 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
290 | add y0, y2 # y2 = S1 + CH | ||
291 | add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
292 | vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} | ||
293 | mov a, y0 # y0 = a | ||
294 | add y2, h # h = h + S1 + CH + k + w | ||
295 | mov a, y2 # y2 = a | ||
296 | vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} | ||
297 | or c, y0 # y0 = a|c | ||
298 | add h, d # d = d + h + S1 + CH + k + w | ||
299 | and c, y2 # y2 = a&c | ||
300 | vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} | ||
301 | and b, y0 # y0 = (a|c)&b | ||
302 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
303 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
304 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
305 | ROTATE_ARGS | ||
306 | rotate_Xs | ||
307 | .endm | ||
308 | |||
309 | ## input is [rsp + _XFER + %1 * 4] | ||
310 | .macro DO_ROUND round | ||
311 | mov e, y0 # y0 = e | ||
312 | MY_ROR (25-11), y0 # y0 = e >> (25-11) | ||
313 | mov a, y1 # y1 = a | ||
314 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
315 | MY_ROR (22-13), y1 # y1 = a >> (22-13) | ||
316 | mov f, y2 # y2 = f | ||
317 | xor a, y1 # y1 = a ^ (a >> (22-13) | ||
318 | MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
319 | xor g, y2 # y2 = f^g | ||
320 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
321 | MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
322 | and e, y2 # y2 = (f^g)&e | ||
323 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
324 | MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
325 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
326 | add y0, y2 # y2 = S1 + CH | ||
327 | MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
328 | offset = \round * 4 + _XFER # | ||
329 | add offset(%rsp), y2 # y2 = k + w + S1 + CH | ||
330 | mov a, y0 # y0 = a | ||
331 | add y2, h # h = h + S1 + CH + k + w | ||
332 | mov a, y2 # y2 = a | ||
333 | or c, y0 # y0 = a|c | ||
334 | add h, d # d = d + h + S1 + CH + k + w | ||
335 | and c, y2 # y2 = a&c | ||
336 | and b, y0 # y0 = (a|c)&b | ||
337 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
338 | or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) | ||
339 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
340 | ROTATE_ARGS | ||
341 | .endm | ||
342 | |||
343 | ######################################################################## | ||
344 | ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) | ||
345 | ## arg 1 : pointer to input data | ||
346 | ## arg 2 : pointer to digest | ||
347 | ## arg 3 : Num blocks | ||
348 | ######################################################################## | ||
349 | .text | ||
350 | ENTRY(sha256_transform_avx) | ||
351 | .align 32 | ||
352 | pushq %rbx | ||
353 | pushq %rbp | ||
354 | pushq %r13 | ||
355 | pushq %r14 | ||
356 | pushq %r15 | ||
357 | pushq %r12 | ||
358 | |||
359 | mov %rsp, %r12 | ||
360 | subq $STACK_SIZE, %rsp # allocate stack space | ||
361 | and $~15, %rsp # align stack pointer | ||
362 | |||
363 | shl $6, NUM_BLKS # convert to bytes | ||
364 | jz done_hash | ||
365 | add INP, NUM_BLKS # pointer to end of data | ||
366 | mov NUM_BLKS, _INP_END(%rsp) | ||
367 | |||
368 | ## load initial digest | ||
369 | mov 4*0(CTX), a | ||
370 | mov 4*1(CTX), b | ||
371 | mov 4*2(CTX), c | ||
372 | mov 4*3(CTX), d | ||
373 | mov 4*4(CTX), e | ||
374 | mov 4*5(CTX), f | ||
375 | mov 4*6(CTX), g | ||
376 | mov 4*7(CTX), h | ||
377 | |||
378 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
379 | vmovdqa _SHUF_00BA(%rip), SHUF_00BA | ||
380 | vmovdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
381 | loop0: | ||
382 | lea K256(%rip), TBL | ||
383 | |||
384 | ## byte swap first 16 dwords | ||
385 | COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK | ||
386 | COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK | ||
387 | COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK | ||
388 | COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK | ||
389 | |||
390 | mov INP, _INP(%rsp) | ||
391 | |||
392 | ## schedule 48 input dwords, by doing 3 rounds of 16 each | ||
393 | mov $3, SRND | ||
394 | .align 16 | ||
395 | loop1: | ||
396 | vpaddd (TBL), X0, XFER | ||
397 | vmovdqa XFER, _XFER(%rsp) | ||
398 | FOUR_ROUNDS_AND_SCHED | ||
399 | |||
400 | vpaddd 1*16(TBL), X0, XFER | ||
401 | vmovdqa XFER, _XFER(%rsp) | ||
402 | FOUR_ROUNDS_AND_SCHED | ||
403 | |||
404 | vpaddd 2*16(TBL), X0, XFER | ||
405 | vmovdqa XFER, _XFER(%rsp) | ||
406 | FOUR_ROUNDS_AND_SCHED | ||
407 | |||
408 | vpaddd 3*16(TBL), X0, XFER | ||
409 | vmovdqa XFER, _XFER(%rsp) | ||
410 | add $4*16, TBL | ||
411 | FOUR_ROUNDS_AND_SCHED | ||
412 | |||
413 | sub $1, SRND | ||
414 | jne loop1 | ||
415 | |||
416 | mov $2, SRND | ||
417 | loop2: | ||
418 | vpaddd (TBL), X0, XFER | ||
419 | vmovdqa XFER, _XFER(%rsp) | ||
420 | DO_ROUND 0 | ||
421 | DO_ROUND 1 | ||
422 | DO_ROUND 2 | ||
423 | DO_ROUND 3 | ||
424 | |||
425 | vpaddd 1*16(TBL), X1, XFER | ||
426 | vmovdqa XFER, _XFER(%rsp) | ||
427 | add $2*16, TBL | ||
428 | DO_ROUND 0 | ||
429 | DO_ROUND 1 | ||
430 | DO_ROUND 2 | ||
431 | DO_ROUND 3 | ||
432 | |||
433 | vmovdqa X2, X0 | ||
434 | vmovdqa X3, X1 | ||
435 | |||
436 | sub $1, SRND | ||
437 | jne loop2 | ||
438 | |||
439 | addm (4*0)(CTX),a | ||
440 | addm (4*1)(CTX),b | ||
441 | addm (4*2)(CTX),c | ||
442 | addm (4*3)(CTX),d | ||
443 | addm (4*4)(CTX),e | ||
444 | addm (4*5)(CTX),f | ||
445 | addm (4*6)(CTX),g | ||
446 | addm (4*7)(CTX),h | ||
447 | |||
448 | mov _INP(%rsp), INP | ||
449 | add $64, INP | ||
450 | cmp _INP_END(%rsp), INP | ||
451 | jne loop0 | ||
452 | |||
453 | done_hash: | ||
454 | |||
455 | mov %r12, %rsp | ||
456 | |||
457 | popq %r12 | ||
458 | popq %r15 | ||
459 | popq %r14 | ||
460 | popq %r13 | ||
461 | popq %rbp | ||
462 | popq %rbx | ||
463 | ret | ||
464 | ENDPROC(sha256_transform_avx) | ||
465 | |||
466 | .data | ||
467 | .align 64 | ||
468 | K256: | ||
469 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
470 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
471 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
472 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
473 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
474 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
475 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
476 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
477 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
478 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
479 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
480 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
481 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
482 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
483 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
484 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
485 | |||
486 | PSHUFFLE_BYTE_FLIP_MASK: | ||
487 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
488 | |||
489 | # shuffle xBxA -> 00BA | ||
490 | _SHUF_00BA: | ||
491 | .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 | ||
492 | |||
493 | # shuffle xDxC -> DC00 | ||
494 | _SHUF_DC00: | ||
495 | .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF | ||
496 | #endif | ||
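The DO_ROUND and FOUR_ROUNDS_AND_SCHED macros above carry the SHA-256 working variables in a..h and the temporaries in y0..y2. For reference, one compression round in plain C, where k_plus_w is the precomputed K[t] + W[t] value the assembly reads back from its _XFER stack slot (illustrative helper, not part of the patch):

    static inline unsigned int ror32(unsigned int x, int n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* s[0..7] = a,b,c,d,e,f,g,h */
    static void sha256_round(unsigned int s[8], unsigned int k_plus_w)
    {
            unsigned int a = s[0], b = s[1], c = s[2], d = s[3];
            unsigned int e = s[4], f = s[5], g = s[6], h = s[7];
            unsigned int S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            unsigned int ch  = ((f ^ g) & e) ^ g;          /* == (e & f) ^ (~e & g) */
            unsigned int t1  = h + S1 + ch + k_plus_w;
            unsigned int S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            unsigned int maj = ((a | c) & b) | (a & c);    /* == (a&b) ^ (a&c) ^ (b&c) */

            s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
            s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
    }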
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S new file mode 100644 index 000000000000..9e86944c539d --- /dev/null +++ b/arch/x86/crypto/sha256-avx2-asm.S | |||
@@ -0,0 +1,772 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-256 with AVX2 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
10 | # | ||
11 | # This software is available to you under a choice of one of two | ||
12 | # licenses. You may choose to be licensed under the terms of the GNU | ||
13 | # General Public License (GPL) Version 2, available from the file | ||
14 | # COPYING in the main directory of this source tree, or the | ||
15 | # OpenIB.org BSD license below: | ||
16 | # | ||
17 | # Redistribution and use in source and binary forms, with or | ||
18 | # without modification, are permitted provided that the following | ||
19 | # conditions are met: | ||
20 | # | ||
21 | # - Redistributions of source code must retain the above | ||
22 | # copyright notice, this list of conditions and the following | ||
23 | # disclaimer. | ||
24 | # | ||
25 | # - Redistributions in binary form must reproduce the above | ||
26 | # copyright notice, this list of conditions and the following | ||
27 | # disclaimer in the documentation and/or other materials | ||
28 | # provided with the distribution. | ||
29 | # | ||
30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
31 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
32 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
33 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
34 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
35 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
36 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
37 | # SOFTWARE. | ||
38 | # | ||
39 | ######################################################################## | ||
40 | # | ||
41 | # This code is described in an Intel White-Paper: | ||
42 | # "Fast SHA-256 Implementations on Intel Architecture Processors" | ||
43 | # | ||
44 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
45 | # and search for that title. | ||
46 | # | ||
47 | ######################################################################## | ||
48 | # This code schedules 2 blocks at a time, with 4 lanes per block | ||
49 | ######################################################################## | ||
50 | |||
51 | #ifdef CONFIG_AS_AVX2 | ||
52 | #include <linux/linkage.h> | ||
53 | |||
54 | ## assume buffers not aligned | ||
55 | #define VMOVDQ vmovdqu | ||
56 | |||
57 | ################################ Define Macros | ||
58 | |||
59 | # addm [mem], reg | ||
60 | # Add reg to mem using reg-mem add and store | ||
61 | .macro addm p1 p2 | ||
62 | add \p1, \p2 | ||
63 | mov \p2, \p1 | ||
64 | .endm | ||
65 | |||
66 | ################################ | ||
67 | |||
68 | X0 = %ymm4 | ||
69 | X1 = %ymm5 | ||
70 | X2 = %ymm6 | ||
71 | X3 = %ymm7 | ||
72 | |||
73 | # XMM versions of above | ||
74 | XWORD0 = %xmm4 | ||
75 | XWORD1 = %xmm5 | ||
76 | XWORD2 = %xmm6 | ||
77 | XWORD3 = %xmm7 | ||
78 | |||
79 | XTMP0 = %ymm0 | ||
80 | XTMP1 = %ymm1 | ||
81 | XTMP2 = %ymm2 | ||
82 | XTMP3 = %ymm3 | ||
83 | XTMP4 = %ymm8 | ||
84 | XFER = %ymm9 | ||
85 | XTMP5 = %ymm11 | ||
86 | |||
87 | SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA | ||
88 | SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 | ||
89 | BYTE_FLIP_MASK = %ymm13 | ||
90 | |||
91 | X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK | ||
92 | |||
93 | NUM_BLKS = %rdx # 3rd arg | ||
94 | CTX = %rsi # 2nd arg | ||
95 | INP = %rdi # 1st arg | ||
96 | c = %ecx | ||
97 | d = %r8d | ||
98 | e = %edx # clobbers NUM_BLKS | ||
99 | y3 = %edi # clobbers INP | ||
100 | |||
101 | |||
102 | TBL = %rbp | ||
103 | SRND = CTX # SRND is same register as CTX | ||
104 | |||
105 | a = %eax | ||
106 | b = %ebx | ||
107 | f = %r9d | ||
108 | g = %r10d | ||
109 | h = %r11d | ||
110 | old_h = %r11d | ||
111 | |||
112 | T1 = %r12d | ||
113 | y0 = %r13d | ||
114 | y1 = %r14d | ||
115 | y2 = %r15d | ||
116 | |||
117 | |||
118 | _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round | ||
119 | _XMM_SAVE_SIZE = 0 | ||
120 | _INP_END_SIZE = 8 | ||
121 | _INP_SIZE = 8 | ||
122 | _CTX_SIZE = 8 | ||
123 | _RSP_SIZE = 8 | ||
124 | |||
125 | _XFER = 0 | ||
126 | _XMM_SAVE = _XFER + _XFER_SIZE | ||
127 | _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE | ||
128 | _INP = _INP_END + _INP_END_SIZE | ||
129 | _CTX = _INP + _INP_SIZE | ||
130 | _RSP = _CTX + _CTX_SIZE | ||
131 | STACK_SIZE = _RSP + _RSP_SIZE | ||
132 | |||
133 | # rotate_Xs | ||
134 | # Rotate values of symbols X0...X3 | ||
135 | .macro rotate_Xs | ||
136 | X_ = X0 | ||
137 | X0 = X1 | ||
138 | X1 = X2 | ||
139 | X2 = X3 | ||
140 | X3 = X_ | ||
141 | .endm | ||
142 | |||
143 | # ROTATE_ARGS | ||
144 | # Rotate values of symbols a...h | ||
145 | .macro ROTATE_ARGS | ||
146 | old_h = h | ||
147 | TMP_ = h | ||
148 | h = g | ||
149 | g = f | ||
150 | f = e | ||
151 | e = d | ||
152 | d = c | ||
153 | c = b | ||
154 | b = a | ||
155 | a = TMP_ | ||
156 | .endm | ||
157 | |||
158 | .macro FOUR_ROUNDS_AND_SCHED disp | ||
159 | ################################### RND N + 0 ############################ | ||
160 | |||
161 | mov a, y3 # y3 = a # MAJA | ||
162 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
163 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
164 | |||
165 | addl \disp(%rsp, SRND), h # h = k + w + h # -- | ||
166 | or c, y3 # y3 = a|c # MAJA | ||
167 | vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] | ||
168 | mov f, y2 # y2 = f # CH | ||
169 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
170 | |||
171 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
172 | xor g, y2 # y2 = f^g # CH | ||
173 | vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 | ||
174 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
175 | |||
176 | and e, y2 # y2 = (f^g)&e # CH | ||
177 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
178 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
179 | add h, d # d = k + w + h + d # -- | ||
180 | |||
181 | and b, y3 # y3 = (a|c)&b # MAJA | ||
182 | vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] | ||
183 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
184 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
185 | |||
186 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
187 | vpsrld $7, XTMP1, XTMP2 | ||
188 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
189 | mov a, T1 # T1 = a # MAJB | ||
190 | and c, T1 # T1 = a&c # MAJB | ||
191 | |||
192 | add y0, y2 # y2 = S1 + CH # -- | ||
193 | vpslld $(32-7), XTMP1, XTMP3 | ||
194 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
195 | add y1, h # h = k + w + h + S0 # -- | ||
196 | |||
197 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
198 | vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 | ||
199 | |||
200 | vpsrld $18, XTMP1, XTMP2 | ||
201 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
202 | add y3, h # h = t1 + S0 + MAJ # -- | ||
203 | |||
204 | |||
205 | ROTATE_ARGS | ||
206 | |||
207 | ################################### RND N + 1 ############################ | ||
208 | |||
209 | mov a, y3 # y3 = a # MAJA | ||
210 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
211 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
212 | offset = \disp + 1*4 | ||
213 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
214 | or c, y3 # y3 = a|c # MAJA | ||
215 | |||
216 | |||
217 | vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 | ||
218 | mov f, y2 # y2 = f # CH | ||
219 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
220 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
221 | xor g, y2 # y2 = f^g # CH | ||
222 | |||
223 | |||
224 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
225 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
226 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
227 | and e, y2 # y2 = (f^g)&e # CH | ||
228 | add h, d # d = k + w + h + d # -- | ||
229 | |||
230 | vpslld $(32-18), XTMP1, XTMP1 | ||
231 | and b, y3 # y3 = (a|c)&b # MAJA | ||
232 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
233 | |||
234 | vpxor XTMP1, XTMP3, XTMP3 | ||
235 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
236 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
237 | |||
238 | vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 | ||
239 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
240 | mov a, T1 # T1 = a # MAJB | ||
241 | and c, T1 # T1 = a&c # MAJB | ||
242 | add y0, y2 # y2 = S1 + CH # -- | ||
243 | |||
244 | vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 | ||
245 | vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
246 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
247 | add y1, h # h = k + w + h + S0 # -- | ||
248 | |||
249 | vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 | ||
250 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
251 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
252 | add y3, h # h = t1 + S0 + MAJ # -- | ||
253 | |||
254 | vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} | ||
255 | |||
256 | |||
257 | ROTATE_ARGS | ||
258 | |||
259 | ################################### RND N + 2 ############################ | ||
260 | |||
261 | mov a, y3 # y3 = a # MAJA | ||
262 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
263 | offset = \disp + 2*4 | ||
264 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
265 | |||
266 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} | ||
267 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
268 | or c, y3 # y3 = a|c # MAJA | ||
269 | mov f, y2 # y2 = f # CH | ||
270 | xor g, y2 # y2 = f^g # CH | ||
271 | |||
272 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
273 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
274 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} | ||
275 | and e, y2 # y2 = (f^g)&e # CH | ||
276 | |||
277 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
278 | vpxor XTMP3, XTMP2, XTMP2 | ||
279 | add h, d # d = k + w + h + d # -- | ||
280 | and b, y3 # y3 = (a|c)&b # MAJA | ||
281 | |||
282 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
283 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
284 | vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} | ||
285 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
286 | |||
287 | vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} | ||
288 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
289 | rorx $2, a ,T1 # T1 = (a >> 2) # S0 | ||
290 | vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} | ||
291 | |||
292 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
293 | mov a, T1 # T1 = a # MAJB | ||
294 | and c, T1 # T1 = a&c # MAJB | ||
295 | add y0, y2 # y2 = S1 + CH # -- | ||
296 | vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} | ||
297 | |||
298 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
299 | add y1,h # h = k + w + h + S0 # -- | ||
300 | add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
301 | add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
302 | |||
303 | add y3,h # h = t1 + S0 + MAJ # -- | ||
304 | |||
305 | |||
306 | ROTATE_ARGS | ||
307 | |||
308 | ################################### RND N + 3 ############################ | ||
309 | |||
310 | mov a, y3 # y3 = a # MAJA | ||
311 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
312 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
313 | offset = \disp + 3*4 | ||
314 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
315 | or c, y3 # y3 = a|c # MAJA | ||
316 | |||
317 | |||
318 | vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} | ||
319 | mov f, y2 # y2 = f # CH | ||
320 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
321 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
322 | xor g, y2 # y2 = f^g # CH | ||
323 | |||
324 | |||
325 | vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} | ||
326 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
327 | and e, y2 # y2 = (f^g)&e # CH | ||
328 | add h, d # d = k + w + h + d # -- | ||
329 | and b, y3 # y3 = (a|c)&b # MAJA | ||
330 | |||
331 | vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} | ||
332 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
333 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
334 | |||
335 | vpxor XTMP3, XTMP2, XTMP2 | ||
336 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
337 | add y0, y2 # y2 = S1 + CH # -- | ||
338 | |||
339 | vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} | ||
340 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
341 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
342 | |||
343 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
344 | vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} | ||
345 | |||
346 | vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} | ||
347 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
348 | mov a, T1 # T1 = a # MAJB | ||
349 | and c, T1 # T1 = a&c # MAJB | ||
350 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
351 | |||
352 | add y1, h # h = k + w + h + S0 # -- | ||
353 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
354 | add y3, h # h = t1 + S0 + MAJ # -- | ||
355 | |||
356 | ROTATE_ARGS | ||
357 | rotate_Xs | ||
358 | .endm | ||
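The vector instructions interleaved into the macro above compute the SHA-256 message schedule four words at a time, and with ymm registers for two input blocks in parallel. The scalar recurrence being vectorized is, using the same ror32() helper as in the earlier sketch:

    /* W[0..15] holds one block's message words; fill in W[16..63] */
    static void sha256_schedule(unsigned int W[64])
    {
            int t;

            for (t = 16; t < 64; t++) {
                    unsigned int w15 = W[t - 15], w2 = W[t - 2];
                    unsigned int s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
                    unsigned int s1 = ror32(w2, 17) ^ ror32(w2, 19)  ^ (w2 >> 10);

                    W[t] = W[t - 16] + s0 + W[t - 7] + s1;
            }
    }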
359 | |||
360 | .macro DO_4ROUNDS disp | ||
361 | ################################### RND N + 0 ########################### | ||
362 | |||
363 | mov f, y2 # y2 = f # CH | ||
364 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
365 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
366 | xor g, y2 # y2 = f^g # CH | ||
367 | |||
368 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
369 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
370 | and e, y2 # y2 = (f^g)&e # CH | ||
371 | |||
372 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
373 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
374 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
375 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
376 | mov a, y3 # y3 = a # MAJA | ||
377 | |||
378 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
379 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
380 | addl \disp(%rsp, SRND), h # h = k + w + h # -- | ||
381 | or c, y3 # y3 = a|c # MAJA | ||
382 | |||
383 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
384 | mov a, T1 # T1 = a # MAJB | ||
385 | and b, y3 # y3 = (a|c)&b # MAJA | ||
386 | and c, T1 # T1 = a&c # MAJB | ||
387 | add y0, y2 # y2 = S1 + CH # -- | ||
388 | |||
389 | |||
390 | add h, d # d = k + w + h + d # -- | ||
391 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
392 | add y1, h # h = k + w + h + S0 # -- | ||
393 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
394 | |||
395 | ROTATE_ARGS | ||
396 | |||
397 | ################################### RND N + 1 ########################### | ||
398 | |||
399 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
400 | mov f, y2 # y2 = f # CH | ||
401 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
402 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
403 | xor g, y2 # y2 = f^g # CH | ||
404 | |||
405 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
406 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
407 | and e, y2 # y2 = (f^g)&e # CH | ||
408 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
409 | |||
410 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
411 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
412 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
413 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
414 | mov a, y3 # y3 = a # MAJA | ||
415 | |||
416 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
417 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
418 | offset = 4*1 + \disp | ||
419 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
420 | or c, y3 # y3 = a|c # MAJA | ||
421 | |||
422 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
423 | mov a, T1 # T1 = a # MAJB | ||
424 | and b, y3 # y3 = (a|c)&b # MAJA | ||
425 | and c, T1 # T1 = a&c # MAJB | ||
426 | add y0, y2 # y2 = S1 + CH # -- | ||
427 | |||
428 | |||
429 | add h, d # d = k + w + h + d # -- | ||
430 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
431 | add y1, h # h = k + w + h + S0 # -- | ||
432 | |||
433 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
434 | |||
435 | ROTATE_ARGS | ||
436 | |||
437 | ################################### RND N + 2 ############################## | ||
438 | |||
439 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
440 | mov f, y2 # y2 = f # CH | ||
441 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
442 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
443 | xor g, y2 # y2 = f^g # CH | ||
444 | |||
445 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
446 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
447 | and e, y2 # y2 = (f^g)&e # CH | ||
448 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
449 | |||
450 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
451 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
452 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
453 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
454 | mov a, y3 # y3 = a # MAJA | ||
455 | |||
456 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
457 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
458 | offset = 4*2 + \disp | ||
459 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
460 | or c, y3 # y3 = a|c # MAJA | ||
461 | |||
462 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
463 | mov a, T1 # T1 = a # MAJB | ||
464 | and b, y3 # y3 = (a|c)&b # MAJA | ||
465 | and c, T1 # T1 = a&c # MAJB | ||
466 | add y0, y2 # y2 = S1 + CH # -- | ||
467 | |||
468 | |||
469 | add h, d # d = k + w + h + d # -- | ||
470 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
471 | add y1, h # h = k + w + h + S0 # -- | ||
472 | |||
473 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
474 | |||
475 | ROTATE_ARGS | ||
476 | |||
477 | ################################### RND N + 3 ########################### | ||
478 | |||
479 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
480 | mov f, y2 # y2 = f # CH | ||
481 | rorx $25, e, y0 # y0 = e >> 25 # S1A | ||
482 | rorx $11, e, y1 # y1 = e >> 11 # S1B | ||
483 | xor g, y2 # y2 = f^g # CH | ||
484 | |||
485 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 | ||
486 | rorx $6, e, y1 # y1 = (e >> 6) # S1 | ||
487 | and e, y2 # y2 = (f^g)&e # CH | ||
488 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
489 | |||
490 | xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 | ||
491 | rorx $13, a, T1 # T1 = a >> 13 # S0B | ||
492 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
493 | rorx $22, a, y1 # y1 = a >> 22 # S0A | ||
494 | mov a, y3 # y3 = a # MAJA | ||
495 | |||
496 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 | ||
497 | rorx $2, a, T1 # T1 = (a >> 2) # S0 | ||
498 | offset = 4*3 + \disp | ||
499 | addl offset(%rsp, SRND), h # h = k + w + h # -- | ||
500 | or c, y3 # y3 = a|c # MAJA | ||
501 | |||
502 | xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 | ||
503 | mov a, T1 # T1 = a # MAJB | ||
504 | and b, y3 # y3 = (a|c)&b # MAJA | ||
505 | and c, T1 # T1 = a&c # MAJB | ||
506 | add y0, y2 # y2 = S1 + CH # -- | ||
507 | |||
508 | |||
509 | add h, d # d = k + w + h + d # -- | ||
510 | or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ | ||
511 | add y1, h # h = k + w + h + S0 # -- | ||
512 | |||
513 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
514 | |||
515 | |||
516 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
517 | |||
518 | add y3, h # h = t1 + S0 + MAJ # -- | ||
519 | |||
520 | ROTATE_ARGS | ||
521 | |||
522 | .endm | ||
523 | |||
524 | ######################################################################## | ||
525 | ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) | ||
526 | ## arg 1 : pointer to input data | ||
527 | ## arg 2 : pointer to digest | ||
528 | ## arg 3 : Num blocks | ||
529 | ######################################################################## | ||
530 | .text | ||
531 | ENTRY(sha256_transform_rorx) | ||
532 | .align 32 | ||
533 | pushq %rbx | ||
534 | pushq %rbp | ||
535 | pushq %r12 | ||
536 | pushq %r13 | ||
537 | pushq %r14 | ||
538 | pushq %r15 | ||
539 | |||
540 | mov %rsp, %rax | ||
541 | subq $STACK_SIZE, %rsp | ||
542 | and $-32, %rsp # align rsp to 32 byte boundary | ||
543 | mov %rax, _RSP(%rsp) | ||
544 | |||
545 | |||
546 | shl $6, NUM_BLKS # convert to bytes | ||
547 | jz done_hash | ||
548 | lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block | ||
549 | mov NUM_BLKS, _INP_END(%rsp) | ||
550 | |||
551 | cmp NUM_BLKS, INP | ||
552 | je only_one_block | ||
553 | |||
554 | ## load initial digest | ||
555 | mov (CTX), a | ||
556 | mov 4*1(CTX), b | ||
557 | mov 4*2(CTX), c | ||
558 | mov 4*3(CTX), d | ||
559 | mov 4*4(CTX), e | ||
560 | mov 4*5(CTX), f | ||
561 | mov 4*6(CTX), g | ||
562 | mov 4*7(CTX), h | ||
563 | |||
564 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
565 | vmovdqa _SHUF_00BA(%rip), SHUF_00BA | ||
566 | vmovdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
567 | |||
568 | mov CTX, _CTX(%rsp) | ||
569 | |||
570 | loop0: | ||
571 | lea K256(%rip), TBL | ||
572 | |||
573 | ## Load first 16 dwords from two blocks | ||
574 | VMOVDQ 0*32(INP),XTMP0 | ||
575 | VMOVDQ 1*32(INP),XTMP1 | ||
576 | VMOVDQ 2*32(INP),XTMP2 | ||
577 | VMOVDQ 3*32(INP),XTMP3 | ||
578 | |||
579 | ## byte swap data | ||
580 | vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 | ||
581 | vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 | ||
582 | vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 | ||
583 | vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 | ||
584 | |||
585 | ## transpose data into high/low halves | ||
586 | vperm2i128 $0x20, XTMP2, XTMP0, X0 | ||
587 | vperm2i128 $0x31, XTMP2, XTMP0, X1 | ||
588 | vperm2i128 $0x20, XTMP3, XTMP1, X2 | ||
589 | vperm2i128 $0x31, XTMP3, XTMP1, X3 | ||
590 | |||
591 | last_block_enter: | ||
592 | add $64, INP | ||
593 | mov INP, _INP(%rsp) | ||
594 | |||
595 | ## schedule 48 input dwords, by doing 3 rounds of 12 each | ||
596 | xor SRND, SRND | ||
597 | |||
598 | .align 16 | ||
599 | loop1: | ||
600 | vpaddd 0*32(TBL, SRND), X0, XFER | ||
601 | vmovdqa XFER, 0*32+_XFER(%rsp, SRND) | ||
602 | FOUR_ROUNDS_AND_SCHED _XFER + 0*32 | ||
603 | |||
604 | vpaddd 1*32(TBL, SRND), X0, XFER | ||
605 | vmovdqa XFER, 1*32+_XFER(%rsp, SRND) | ||
606 | FOUR_ROUNDS_AND_SCHED _XFER + 1*32 | ||
607 | |||
608 | vpaddd 2*32(TBL, SRND), X0, XFER | ||
609 | vmovdqa XFER, 2*32+_XFER(%rsp, SRND) | ||
610 | FOUR_ROUNDS_AND_SCHED _XFER + 2*32 | ||
611 | |||
612 | vpaddd 3*32(TBL, SRND), X0, XFER | ||
613 | vmovdqa XFER, 3*32+_XFER(%rsp, SRND) | ||
614 | FOUR_ROUNDS_AND_SCHED _XFER + 3*32 | ||
615 | |||
616 | add $4*32, SRND | ||
617 | cmp $3*4*32, SRND | ||
618 | jb loop1 | ||
619 | |||
620 | loop2: | ||
621 | ## Do last 16 rounds with no scheduling | ||
622 | vpaddd 0*32(TBL, SRND), X0, XFER | ||
623 | vmovdqa XFER, 0*32+_XFER(%rsp, SRND) | ||
624 | DO_4ROUNDS _XFER + 0*32 | ||
625 | vpaddd 1*32(TBL, SRND), X1, XFER | ||
626 | vmovdqa XFER, 1*32+_XFER(%rsp, SRND) | ||
627 | DO_4ROUNDS _XFER + 1*32 | ||
628 | add $2*32, SRND | ||
629 | |||
630 | vmovdqa X2, X0 | ||
631 | vmovdqa X3, X1 | ||
632 | |||
633 | cmp $4*4*32, SRND | ||
634 | jb loop2 | ||
635 | |||
636 | mov _CTX(%rsp), CTX | ||
637 | mov _INP(%rsp), INP | ||
638 | |||
639 | addm (4*0)(CTX),a | ||
640 | addm (4*1)(CTX),b | ||
641 | addm (4*2)(CTX),c | ||
642 | addm (4*3)(CTX),d | ||
643 | addm (4*4)(CTX),e | ||
644 | addm (4*5)(CTX),f | ||
645 | addm (4*6)(CTX),g | ||
646 | addm (4*7)(CTX),h | ||
647 | |||
648 | cmp _INP_END(%rsp), INP | ||
649 | ja done_hash | ||
650 | |||
651 | #### Do second block using previously scheduled results | ||
652 | xor SRND, SRND | ||
653 | .align 16 | ||
654 | loop3: | ||
655 | DO_4ROUNDS _XFER + 0*32 + 16 | ||
656 | DO_4ROUNDS _XFER + 1*32 + 16 | ||
657 | add $2*32, SRND | ||
658 | cmp $4*4*32, SRND | ||
659 | jb loop3 | ||
660 | |||
661 | mov _CTX(%rsp), CTX | ||
662 | mov _INP(%rsp), INP | ||
663 | add $64, INP | ||
664 | |||
665 | addm (4*0)(CTX),a | ||
666 | addm (4*1)(CTX),b | ||
667 | addm (4*2)(CTX),c | ||
668 | addm (4*3)(CTX),d | ||
669 | addm (4*4)(CTX),e | ||
670 | addm (4*5)(CTX),f | ||
671 | addm (4*6)(CTX),g | ||
672 | addm (4*7)(CTX),h | ||
673 | |||
674 | cmp _INP_END(%rsp), INP | ||
675 | jb loop0 | ||
676 | ja done_hash | ||
677 | |||
678 | do_last_block: | ||
679 | #### do last block | ||
680 | lea K256(%rip), TBL | ||
681 | |||
682 | VMOVDQ 0*16(INP),XWORD0 | ||
683 | VMOVDQ 1*16(INP),XWORD1 | ||
684 | VMOVDQ 2*16(INP),XWORD2 | ||
685 | VMOVDQ 3*16(INP),XWORD3 | ||
686 | |||
687 | vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 | ||
688 | vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 | ||
689 | vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 | ||
690 | vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 | ||
691 | |||
692 | jmp last_block_enter | ||
693 | |||
694 | only_one_block: | ||
695 | |||
696 | ## load initial digest | ||
697 | mov (4*0)(CTX),a | ||
698 | mov (4*1)(CTX),b | ||
699 | mov (4*2)(CTX),c | ||
700 | mov (4*3)(CTX),d | ||
701 | mov (4*4)(CTX),e | ||
702 | mov (4*5)(CTX),f | ||
703 | mov (4*6)(CTX),g | ||
704 | mov (4*7)(CTX),h | ||
705 | |||
706 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
707 | vmovdqa _SHUF_00BA(%rip), SHUF_00BA | ||
708 | vmovdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
709 | |||
710 | mov CTX, _CTX(%rsp) | ||
711 | jmp do_last_block | ||
712 | |||
713 | done_hash: | ||
714 | |||
715 | mov _RSP(%rsp), %rsp | ||
716 | |||
717 | popq %r15 | ||
718 | popq %r14 | ||
719 | popq %r13 | ||
720 | popq %r12 | ||
721 | popq %rbp | ||
722 | popq %rbx | ||
723 | ret | ||
724 | ENDPROC(sha256_transform_rorx) | ||
725 | |||
726 | .data | ||
727 | .align 64 | ||
728 | K256: | ||
729 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
730 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
731 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
732 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
733 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
734 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
735 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
736 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
737 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
738 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
739 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
740 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
741 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
742 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
743 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
744 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
745 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
746 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
747 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
748 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
749 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
750 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
751 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
752 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
753 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
754 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
755 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
756 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
757 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
758 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
759 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
760 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
761 | |||
762 | PSHUFFLE_BYTE_FLIP_MASK: | ||
763 | .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 | ||
764 | |||
765 | # shuffle xBxA -> 00BA | ||
766 | _SHUF_00BA: | ||
767 | .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 | ||
768 | |||
769 | # shuffle xDxC -> DC00 | ||
770 | _SHUF_DC00: | ||
771 | .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF | ||
772 | #endif | ||
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S new file mode 100644 index 000000000000..98d3c391da81 --- /dev/null +++ b/arch/x86/crypto/sha256-ssse3-asm.S | |||
@@ -0,0 +1,506 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-256 with SSSE3 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
10 | # | ||
11 | # This software is available to you under a choice of one of two | ||
12 | # licenses. You may choose to be licensed under the terms of the GNU | ||
13 | # General Public License (GPL) Version 2, available from the file | ||
14 | # COPYING in the main directory of this source tree, or the | ||
15 | # OpenIB.org BSD license below: | ||
16 | # | ||
17 | # Redistribution and use in source and binary forms, with or | ||
18 | # without modification, are permitted provided that the following | ||
19 | # conditions are met: | ||
20 | # | ||
21 | # - Redistributions of source code must retain the above | ||
22 | # copyright notice, this list of conditions and the following | ||
23 | # disclaimer. | ||
24 | # | ||
25 | # - Redistributions in binary form must reproduce the above | ||
26 | # copyright notice, this list of conditions and the following | ||
27 | # disclaimer in the documentation and/or other materials | ||
28 | # provided with the distribution. | ||
29 | # | ||
30 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
31 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
32 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
33 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
34 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
35 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
36 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
37 | # SOFTWARE. | ||
38 | # | ||
39 | ######################################################################## | ||
40 | # | ||
41 | # This code is described in an Intel White-Paper: | ||
42 | # "Fast SHA-256 Implementations on Intel Architecture Processors" | ||
43 | # | ||
44 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
45 | # and search for that title. | ||
46 | # | ||
47 | ######################################################################## | ||
48 | |||
49 | #include <linux/linkage.h> | ||
50 | |||
51 | ## assume buffers not aligned | ||
52 | #define MOVDQ movdqu | ||
53 | |||
54 | ################################ Define Macros | ||
55 | |||
56 | # addm [mem], reg | ||
57 | # Add reg to mem using reg-mem add and store | ||
58 | .macro addm p1 p2 | ||
59 | add \p1, \p2 | ||
60 | mov \p2, \p1 | ||
61 | .endm | ||
62 | |||
63 | ################################ | ||
64 | |||
65 | # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask | ||
66 | # Load xmm with mem and byte swap each dword | ||
67 | .macro COPY_XMM_AND_BSWAP p1 p2 p3 | ||
68 | MOVDQ \p2, \p1 | ||
69 | pshufb \p3, \p1 | ||
70 | .endm | ||
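The PSHUFFLE_BYTE_FLIP_MASK used with pshufb here reverses the byte order within each 32-bit lane, so the big-endian SHA-256 message words come out in host order as part of the load. A minimal scalar sketch of the same effect, assuming only standard C plus the GCC/Clang bswap builtin (load_be32_words and src are illustrative names, not part of this patch):

	#include <stdint.h>
	#include <string.h>

	/* Scalar equivalent of COPY_XMM_AND_BSWAP for one 16-byte chunk. */
	static void load_be32_words(uint32_t w[4], const uint8_t *src)
	{
		int i;

		for (i = 0; i < 4; i++) {
			uint32_t v;

			memcpy(&v, src + 4 * i, sizeof(v)); /* MOVDQ: unaligned load */
			w[i] = __builtin_bswap32(v);        /* pshufb with the flip mask */
		}
	}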
71 | |||
72 | ################################ | ||
73 | |||
74 | X0 = %xmm4 | ||
75 | X1 = %xmm5 | ||
76 | X2 = %xmm6 | ||
77 | X3 = %xmm7 | ||
78 | |||
79 | XTMP0 = %xmm0 | ||
80 | XTMP1 = %xmm1 | ||
81 | XTMP2 = %xmm2 | ||
82 | XTMP3 = %xmm3 | ||
83 | XTMP4 = %xmm8 | ||
84 | XFER = %xmm9 | ||
85 | |||
86 | SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA | ||
87 | SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 | ||
88 | BYTE_FLIP_MASK = %xmm12 | ||
89 | |||
90 | NUM_BLKS = %rdx # 3rd arg | ||
91 | CTX = %rsi # 2nd arg | ||
92 | INP = %rdi # 1st arg | ||
93 | |||
94 | SRND = %rdi # clobbers INP | ||
95 | c = %ecx | ||
96 | d = %r8d | ||
97 | e = %edx | ||
98 | TBL = %rbp | ||
99 | a = %eax | ||
100 | b = %ebx | ||
101 | |||
102 | f = %r9d | ||
103 | g = %r10d | ||
104 | h = %r11d | ||
105 | |||
106 | y0 = %r13d | ||
107 | y1 = %r14d | ||
108 | y2 = %r15d | ||
109 | |||
110 | |||
111 | |||
112 | _INP_END_SIZE = 8 | ||
113 | _INP_SIZE = 8 | ||
114 | _XFER_SIZE = 8 | ||
115 | _XMM_SAVE_SIZE = 0 | ||
116 | |||
117 | _INP_END = 0 | ||
118 | _INP = _INP_END + _INP_END_SIZE | ||
119 | _XFER = _INP + _INP_SIZE | ||
120 | _XMM_SAVE = _XFER + _XFER_SIZE | ||
121 | STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE | ||
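With the sizes defined above (8 + 8 + 8 + 0 bytes) the offsets evaluate to _INP_END = 0, _INP = 8, _XFER = 16 and _XMM_SAVE = 24, so STACK_SIZE = 24: the frame holds the end-of-input pointer, the saved input pointer, and the XFER scratch slot for the current W[t]+K[t] values.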
122 | |||
123 | # rotate_Xs | ||
124 | # Rotate values of symbols X0...X3 | ||
125 | .macro rotate_Xs | ||
126 | X_ = X0 | ||
127 | X0 = X1 | ||
128 | X1 = X2 | ||
129 | X2 = X3 | ||
130 | X3 = X_ | ||
131 | .endm | ||
132 | |||
133 | # ROTATE_ARGS | ||
134 | # Rotate values of symbols a...h | ||
135 | .macro ROTATE_ARGS | ||
136 | TMP_ = h | ||
137 | h = g | ||
138 | g = f | ||
139 | f = e | ||
140 | e = d | ||
141 | d = c | ||
142 | c = b | ||
143 | b = a | ||
144 | a = TMP_ | ||
145 | .endm | ||
146 | |||
147 | .macro FOUR_ROUNDS_AND_SCHED | ||
148 | ## compute s0 four at a time and s1 two at a time | ||
149 | ## compute W[-16] + W[-7] 4 at a time | ||
150 | movdqa X3, XTMP0 | ||
151 | mov e, y0 # y0 = e | ||
152 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
153 | mov a, y1 # y1 = a | ||
154 | palignr $4, X2, XTMP0 # XTMP0 = W[-7] | ||
155 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
156 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
157 | mov f, y2 # y2 = f | ||
158 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
159 | movdqa X1, XTMP1 | ||
160 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
161 | xor g, y2 # y2 = f^g | ||
162 | paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] | ||
163 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
164 | and e, y2 # y2 = (f^g)&e | ||
165 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
166 | ## compute s0 | ||
167 | palignr $4, X0, XTMP1 # XTMP1 = W[-15] | ||
168 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
169 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
170 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
171 | movdqa XTMP1, XTMP2 # XTMP2 = W[-15] | ||
172 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
173 | add y0, y2 # y2 = S1 + CH | ||
174 | add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH | ||
175 | movdqa XTMP1, XTMP3 # XTMP3 = W[-15] | ||
176 | mov a, y0 # y0 = a | ||
177 | add y2, h # h = h + S1 + CH + k + w | ||
178 | mov a, y2 # y2 = a | ||
179 | pslld $(32-7), XTMP1 # | ||
180 | or c, y0 # y0 = a|c | ||
181 | add h, d # d = d + h + S1 + CH + k + w | ||
182 | and c, y2 # y2 = a&c | ||
183 | psrld $7, XTMP2 # | ||
184 | and b, y0 # y0 = (a|c)&b | ||
185 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
186 | por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 | ||
187 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
188 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
189 | # | ||
190 | ROTATE_ARGS # | ||
191 | movdqa XTMP3, XTMP2 # XTMP2 = W[-15] | ||
192 | mov e, y0 # y0 = e | ||
193 | mov a, y1 # y1 = a | ||
194 | movdqa XTMP3, XTMP4 # XTMP4 = W[-15] | ||
195 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
196 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
197 | mov f, y2 # y2 = f | ||
198 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
199 | pslld $(32-18), XTMP3 # | ||
200 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
201 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
202 | xor g, y2 # y2 = f^g | ||
203 | psrld $18, XTMP2 # | ||
204 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
205 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
206 | and e, y2 # y2 = (f^g)&e | ||
207 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
208 | pxor XTMP3, XTMP1 | ||
209 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
210 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
211 | psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 | ||
212 | add y0, y2 # y2 = S1 + CH | ||
213 | add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
214 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
215 | pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 | ||
216 | mov a, y0 # y0 = a | ||
217 | add y2, h # h = h + S1 + CH + k + w | ||
218 | mov a, y2 # y2 = a | ||
219 | pxor XTMP4, XTMP1 # XTMP1 = s0 | ||
220 | or c, y0 # y0 = a|c | ||
221 | add h, d # d = d + h + S1 + CH + k + w | ||
222 | and c, y2 # y2 = a&c | ||
223 | ## compute low s1 | ||
224 | pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
225 | and b, y0 # y0 = (a|c)&b | ||
226 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
227 | paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 | ||
228 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
229 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
230 | |||
231 | ROTATE_ARGS | ||
232 | movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} | ||
233 | mov e, y0 # y0 = e | ||
234 | mov a, y1 # y1 = a | ||
235 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
236 | movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} | ||
237 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
238 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
239 | mov f, y2 # y2 = f | ||
240 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
241 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
242 | psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} | ||
243 | xor g, y2 # y2 = f^g | ||
244 | psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} | ||
245 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
246 | and e, y2 # y2 = (f^g)&e | ||
247 | psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} | ||
248 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
249 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
250 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
251 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
252 | pxor XTMP3, XTMP2 | ||
253 | add y0, y2 # y2 = S1 + CH | ||
254 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
255 | add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
256 | pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} | ||
257 | mov a, y0 # y0 = a | ||
258 | add y2, h # h = h + S1 + CH + k + w | ||
259 | mov a, y2 # y2 = a | ||
260 | pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} | ||
261 | or c, y0 # y0 = a|c | ||
262 | add h, d # d = d + h + S1 + CH + k + w | ||
263 | and c, y2 # y2 = a&c | ||
264 | paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} | ||
265 | and b, y0 # y0 = (a|c)&b | ||
266 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
267 | ## compute high s1 | ||
268 | pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} | ||
269 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
270 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
271 | # | ||
272 | ROTATE_ARGS # | ||
273 | movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} | ||
274 | mov e, y0 # y0 = e | ||
275 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
276 | mov a, y1 # y1 = a | ||
277 | movdqa XTMP2, X0 # X0 = W[-2] {DDCC} | ||
278 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
279 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
280 | mov f, y2 # y2 = f | ||
281 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
282 | psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} | ||
283 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
284 | xor g, y2 # y2 = f^g | ||
285 | psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} | ||
286 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
287 | and e, y2 # y2 = (f^g)&e | ||
288 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
289 | psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} | ||
290 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
291 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
292 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
293 | pxor XTMP3, XTMP2 # | ||
294 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
295 | add y0, y2 # y2 = S1 + CH | ||
296 | add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH | ||
297 | pxor XTMP2, X0 # X0 = s1 {xDxC} | ||
298 | mov a, y0 # y0 = a | ||
299 | add y2, h # h = h + S1 + CH + k + w | ||
300 | mov a, y2 # y2 = a | ||
301 | pshufb SHUF_DC00, X0 # X0 = s1 {DC00} | ||
302 | or c, y0 # y0 = a|c | ||
303 | add h, d # d = d + h + S1 + CH + k + w | ||
304 | and c, y2 # y2 = a&c | ||
305 | paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} | ||
306 | and b, y0 # y0 = (a|c)&b | ||
307 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
308 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
309 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
310 | |||
311 | ROTATE_ARGS | ||
312 | rotate_Xs | ||
313 | .endm | ||
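For reference, the interleaved instructions above implement the standard FIPS 180-4 SHA-256 message schedule and round function. A plain-C restatement under that assumption (ror32, sched_w and sha256_round are illustrative names, not part of this patch):

	#include <stdint.h>

	static inline uint32_t ror32(uint32_t x, unsigned int n)
	{
		return (x >> n) | (x << (32 - n));
	}

	/* Message schedule: one new word from four earlier ones. */
	static uint32_t sched_w(uint32_t w2, uint32_t w7, uint32_t w15, uint32_t w16)
	{
		uint32_t s0 = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
		uint32_t s1 = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);

		return s1 + w7 + s0 + w16;
	}

	/* One round; kw is the precomputed K[t] + W[t] kept in the XFER slot. */
	static void sha256_round(uint32_t st[8], uint32_t kw)
	{
		uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
		uint32_t e = st[4], f = st[5], g = st[6], h = st[7];
		uint32_t s1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
		uint32_t ch  = ((f ^ g) & e) ^ g;         /* same form as the asm */
		uint32_t s0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
		uint32_t maj = ((a | c) & b) | (a & c);   /* same form as the asm */
		uint32_t t1  = h + s1 + ch + kw;

		st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
		st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + s0 + maj;
	}

The three-operation forms ((f^g)&e)^g and ((a|c)&b)|(a&c) are bitwise identical to the textbook CH and MAJ and are what the register comments above describe.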
314 | |||
315 | ## input is [rsp + _XFER + %1 * 4] | ||
316 | .macro DO_ROUND round | ||
317 | mov e, y0 # y0 = e | ||
318 | ror $(25-11), y0 # y0 = e >> (25-11) | ||
319 | mov a, y1 # y1 = a | ||
320 | xor e, y0 # y0 = e ^ (e >> (25-11)) | ||
321 | ror $(22-13), y1 # y1 = a >> (22-13) | ||
322 | mov f, y2 # y2 = f | ||
323 | xor a, y1 # y1 = a ^ (a >> (22-13)) | ||
324 | ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) | ||
325 | xor g, y2 # y2 = f^g | ||
326 | xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | ||
327 | ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) | ||
328 | and e, y2 # y2 = (f^g)&e | ||
329 | xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | ||
330 | ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) | ||
331 | xor g, y2 # y2 = CH = ((f^g)&e)^g | ||
332 | add y0, y2 # y2 = S1 + CH | ||
333 | ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | ||
334 | offset = \round * 4 + _XFER | ||
335 | add offset(%rsp), y2 # y2 = k + w + S1 + CH | ||
336 | mov a, y0 # y0 = a | ||
337 | add y2, h # h = h + S1 + CH + k + w | ||
338 | mov a, y2 # y2 = a | ||
339 | or c, y0 # y0 = a|c | ||
340 | add h, d # d = d + h + S1 + CH + k + w | ||
341 | and c, y2 # y2 = a&c | ||
342 | and b, y0 # y0 = (a|c)&b | ||
343 | add y1, h # h = h + S1 + CH + k + w + S0 | ||
344 | or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c) | ||
345 | add y0, h # h = h + S1 + CH + k + w + S0 + MAJ | ||
346 | ROTATE_ARGS | ||
347 | .endm | ||
348 | |||
349 | ######################################################################## | ||
350 | ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) | ||
351 | ## arg 1 : pointer to input data | ||
352 | ## arg 2 : pointer to digest | ||
353 | ## arg 3 : Num blocks | ||
354 | ######################################################################## | ||
355 | .text | ||
356 | ENTRY(sha256_transform_ssse3) | ||
357 | .align 32 | ||
358 | pushq %rbx | ||
359 | pushq %rbp | ||
360 | pushq %r13 | ||
361 | pushq %r14 | ||
362 | pushq %r15 | ||
363 | pushq %r12 | ||
364 | |||
365 | mov %rsp, %r12 | ||
366 | subq $STACK_SIZE, %rsp | ||
367 | and $~15, %rsp | ||
368 | |||
369 | shl $6, NUM_BLKS # convert to bytes | ||
370 | jz done_hash | ||
371 | add INP, NUM_BLKS | ||
372 | mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data | ||
373 | |||
374 | ## load initial digest | ||
375 | mov 4*0(CTX), a | ||
376 | mov 4*1(CTX), b | ||
377 | mov 4*2(CTX), c | ||
378 | mov 4*3(CTX), d | ||
379 | mov 4*4(CTX), e | ||
380 | mov 4*5(CTX), f | ||
381 | mov 4*6(CTX), g | ||
382 | mov 4*7(CTX), h | ||
383 | |||
384 | movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
385 | movdqa _SHUF_00BA(%rip), SHUF_00BA | ||
386 | movdqa _SHUF_DC00(%rip), SHUF_DC00 | ||
387 | |||
388 | loop0: | ||
389 | lea K256(%rip), TBL | ||
390 | |||
391 | ## byte swap first 16 dwords | ||
392 | COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK | ||
393 | COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK | ||
394 | COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK | ||
395 | COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK | ||
396 | |||
397 | mov INP, _INP(%rsp) | ||
398 | |||
399 | ## schedule 48 input dwords, by doing 3 rounds of 16 each | ||
400 | mov $3, SRND | ||
401 | .align 16 | ||
402 | loop1: | ||
403 | movdqa (TBL), XFER | ||
404 | paddd X0, XFER | ||
405 | movdqa XFER, _XFER(%rsp) | ||
406 | FOUR_ROUNDS_AND_SCHED | ||
407 | |||
408 | movdqa 1*16(TBL), XFER | ||
409 | paddd X0, XFER | ||
410 | movdqa XFER, _XFER(%rsp) | ||
411 | FOUR_ROUNDS_AND_SCHED | ||
412 | |||
413 | movdqa 2*16(TBL), XFER | ||
414 | paddd X0, XFER | ||
415 | movdqa XFER, _XFER(%rsp) | ||
416 | FOUR_ROUNDS_AND_SCHED | ||
417 | |||
418 | movdqa 3*16(TBL), XFER | ||
419 | paddd X0, XFER | ||
420 | movdqa XFER, _XFER(%rsp) | ||
421 | add $4*16, TBL | ||
422 | FOUR_ROUNDS_AND_SCHED | ||
423 | |||
424 | sub $1, SRND | ||
425 | jne loop1 | ||
426 | |||
427 | mov $2, SRND | ||
428 | loop2: | ||
429 | paddd (TBL), X0 | ||
430 | movdqa X0, _XFER(%rsp) | ||
431 | DO_ROUND 0 | ||
432 | DO_ROUND 1 | ||
433 | DO_ROUND 2 | ||
434 | DO_ROUND 3 | ||
435 | paddd 1*16(TBL), X1 | ||
436 | movdqa X1, _XFER(%rsp) | ||
437 | add $2*16, TBL | ||
438 | DO_ROUND 0 | ||
439 | DO_ROUND 1 | ||
440 | DO_ROUND 2 | ||
441 | DO_ROUND 3 | ||
442 | |||
443 | movdqa X2, X0 | ||
444 | movdqa X3, X1 | ||
445 | |||
446 | sub $1, SRND | ||
447 | jne loop2 | ||
448 | |||
449 | addm (4*0)(CTX),a | ||
450 | addm (4*1)(CTX),b | ||
451 | addm (4*2)(CTX),c | ||
452 | addm (4*3)(CTX),d | ||
453 | addm (4*4)(CTX),e | ||
454 | addm (4*5)(CTX),f | ||
455 | addm (4*6)(CTX),g | ||
456 | addm (4*7)(CTX),h | ||
457 | |||
458 | mov _INP(%rsp), INP | ||
459 | add $64, INP | ||
460 | cmp _INP_END(%rsp), INP | ||
461 | jne loop0 | ||
462 | |||
463 | done_hash: | ||
464 | |||
465 | mov %r12, %rsp | ||
466 | |||
467 | popq %r12 | ||
468 | popq %r15 | ||
469 | popq %r14 | ||
470 | popq %r13 | ||
471 | popq %rbp | ||
472 | popq %rbx | ||
473 | |||
474 | ret | ||
475 | ENDPROC(sha256_transform_ssse3) | ||
476 | |||
477 | .data | ||
478 | .align 64 | ||
479 | K256: | ||
480 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
481 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
482 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
483 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
484 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
485 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
486 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
487 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
488 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
489 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
490 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
491 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
492 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
493 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
494 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
495 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
496 | |||
497 | PSHUFFLE_BYTE_FLIP_MASK: | ||
498 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
499 | |||
500 | # shuffle xBxA -> 00BA | ||
501 | _SHUF_00BA: | ||
502 | .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 | ||
503 | |||
504 | # shuffle xDxC -> DC00 | ||
505 | _SHUF_DC00: | ||
506 | .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF | ||
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c new file mode 100644 index 000000000000..597d4da69656 --- /dev/null +++ b/arch/x86/crypto/sha256_ssse3_glue.c | |||
@@ -0,0 +1,275 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * Glue code for the SHA256 Secure Hash Algorithm assembler | ||
5 | * implementation using supplemental SSE3 / AVX / AVX2 instructions. | ||
6 | * | ||
7 | * This file is based on sha256_generic.c | ||
8 | * | ||
9 | * Copyright (C) 2013 Intel Corporation. | ||
10 | * | ||
11 | * Author: | ||
12 | * Tim Chen <tim.c.chen@linux.intel.com> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License as published by the Free | ||
16 | * Software Foundation; either version 2 of the License, or (at your option) | ||
17 | * any later version. | ||
18 | * | ||
19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
20 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
22 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
23 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
24 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
25 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
26 | * SOFTWARE. | ||
27 | */ | ||
28 | |||
29 | |||
30 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
31 | |||
32 | #include <crypto/internal/hash.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/module.h> | ||
35 | #include <linux/mm.h> | ||
36 | #include <linux/cryptohash.h> | ||
37 | #include <linux/types.h> | ||
38 | #include <crypto/sha.h> | ||
39 | #include <asm/byteorder.h> | ||
40 | #include <asm/i387.h> | ||
41 | #include <asm/xcr.h> | ||
42 | #include <asm/xsave.h> | ||
43 | #include <linux/string.h> | ||
44 | |||
45 | asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, | ||
46 | u64 rounds); | ||
47 | #ifdef CONFIG_AS_AVX | ||
48 | asmlinkage void sha256_transform_avx(const char *data, u32 *digest, | ||
49 | u64 rounds); | ||
50 | #endif | ||
51 | #ifdef CONFIG_AS_AVX2 | ||
52 | asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, | ||
53 | u64 rounds); | ||
54 | #endif | ||
55 | |||
56 | static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); | ||
57 | |||
58 | |||
59 | static int sha256_ssse3_init(struct shash_desc *desc) | ||
60 | { | ||
61 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
62 | |||
63 | sctx->state[0] = SHA256_H0; | ||
64 | sctx->state[1] = SHA256_H1; | ||
65 | sctx->state[2] = SHA256_H2; | ||
66 | sctx->state[3] = SHA256_H3; | ||
67 | sctx->state[4] = SHA256_H4; | ||
68 | sctx->state[5] = SHA256_H5; | ||
69 | sctx->state[6] = SHA256_H6; | ||
70 | sctx->state[7] = SHA256_H7; | ||
71 | sctx->count = 0; | ||
72 | |||
73 | return 0; | ||
74 | } | ||
75 | |||
76 | static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
77 | unsigned int len, unsigned int partial) | ||
78 | { | ||
79 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
80 | unsigned int done = 0; | ||
81 | |||
82 | sctx->count += len; | ||
83 | |||
84 | if (partial) { | ||
85 | done = SHA256_BLOCK_SIZE - partial; | ||
86 | memcpy(sctx->buf + partial, data, done); | ||
87 | sha256_transform_asm(sctx->buf, sctx->state, 1); | ||
88 | } | ||
89 | |||
90 | if (len - done >= SHA256_BLOCK_SIZE) { | ||
91 | const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; | ||
92 | |||
93 | sha256_transform_asm(data + done, sctx->state, (u64) rounds); | ||
94 | |||
95 | done += rounds * SHA256_BLOCK_SIZE; | ||
96 | } | ||
97 | |||
98 | memcpy(sctx->buf, data + done, len - done); | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
104 | unsigned int len) | ||
105 | { | ||
106 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
107 | unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; | ||
108 | int res; | ||
109 | |||
110 | /* Handle the fast case right here */ | ||
111 | if (partial + len < SHA256_BLOCK_SIZE) { | ||
112 | sctx->count += len; | ||
113 | memcpy(sctx->buf + partial, data, len); | ||
114 | |||
115 | return 0; | ||
116 | } | ||
117 | |||
118 | if (!irq_fpu_usable()) { | ||
119 | res = crypto_sha256_update(desc, data, len); | ||
120 | } else { | ||
121 | kernel_fpu_begin(); | ||
122 | res = __sha256_ssse3_update(desc, data, len, partial); | ||
123 | kernel_fpu_end(); | ||
124 | } | ||
125 | |||
126 | return res; | ||
127 | } | ||
128 | |||
129 | |||
130 | /* Add padding and return the message digest. */ | ||
131 | static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) | ||
132 | { | ||
133 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
134 | unsigned int i, index, padlen; | ||
135 | __be32 *dst = (__be32 *)out; | ||
136 | __be64 bits; | ||
137 | static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; | ||
138 | |||
139 | bits = cpu_to_be64(sctx->count << 3); | ||
140 | |||
141 | /* Pad out to 56 mod 64 and append length */ | ||
142 | index = sctx->count % SHA256_BLOCK_SIZE; | ||
143 | padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); | ||
144 | |||
145 | if (!irq_fpu_usable()) { | ||
146 | crypto_sha256_update(desc, padding, padlen); | ||
147 | crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
148 | } else { | ||
149 | kernel_fpu_begin(); | ||
150 | /* We need to fill a whole block for __sha256_ssse3_update() */ | ||
151 | if (padlen <= 56) { | ||
152 | sctx->count += padlen; | ||
153 | memcpy(sctx->buf + index, padding, padlen); | ||
154 | } else { | ||
155 | __sha256_ssse3_update(desc, padding, padlen, index); | ||
156 | } | ||
157 | __sha256_ssse3_update(desc, (const u8 *)&bits, | ||
158 | sizeof(bits), 56); | ||
159 | kernel_fpu_end(); | ||
160 | } | ||
161 | |||
162 | /* Store state in digest */ | ||
163 | for (i = 0; i < 8; i++) | ||
164 | dst[i] = cpu_to_be32(sctx->state[i]); | ||
165 | |||
166 | /* Wipe context */ | ||
167 | memset(sctx, 0, sizeof(*sctx)); | ||
168 | |||
169 | return 0; | ||
170 | } | ||
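The padlen computation above always leaves exactly eight bytes for the big-endian bit count, so the padded message ends on a 64-byte boundary. A small illustrative helper for checking the arithmetic by hand (pad_len is not part of this patch):

	/* index = count % 64 */
	static unsigned int pad_len(unsigned int index)
	{
		return (index < 56) ? (56 - index) : ((64 + 56) - index);
	}

	/*
	 * pad_len(0) == 56, pad_len(55) == 1, pad_len(56) == 64, pad_len(63) == 57;
	 * in every case (index + pad_len(index) + 8) % 64 == 0.
	 */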
171 | |||
172 | static int sha256_ssse3_export(struct shash_desc *desc, void *out) | ||
173 | { | ||
174 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
175 | |||
176 | memcpy(out, sctx, sizeof(*sctx)); | ||
177 | |||
178 | return 0; | ||
179 | } | ||
180 | |||
181 | static int sha256_ssse3_import(struct shash_desc *desc, const void *in) | ||
182 | { | ||
183 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
184 | |||
185 | memcpy(sctx, in, sizeof(*sctx)); | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static struct shash_alg alg = { | ||
191 | .digestsize = SHA256_DIGEST_SIZE, | ||
192 | .init = sha256_ssse3_init, | ||
193 | .update = sha256_ssse3_update, | ||
194 | .final = sha256_ssse3_final, | ||
195 | .export = sha256_ssse3_export, | ||
196 | .import = sha256_ssse3_import, | ||
197 | .descsize = sizeof(struct sha256_state), | ||
198 | .statesize = sizeof(struct sha256_state), | ||
199 | .base = { | ||
200 | .cra_name = "sha256", | ||
201 | .cra_driver_name = "sha256-ssse3", | ||
202 | .cra_priority = 150, | ||
203 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
204 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
205 | .cra_module = THIS_MODULE, | ||
206 | } | ||
207 | }; | ||
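Because cra_priority is higher than that of the generic C sha256 implementation, ordinary users of the shash API pick this driver up transparently once it is registered. A minimal sketch of such a caller, assuming the usual shash helpers and with error handling trimmed (demo_sha256 and its parameters are illustrative, not part of this patch):

	#include <crypto/hash.h>
	#include <crypto/sha.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	static int demo_sha256(const u8 *buf, unsigned int len,
			       u8 out[SHA256_DIGEST_SIZE])
	{
		struct crypto_shash *tfm;
		struct shash_desc *desc;
		int err;

		tfm = crypto_alloc_shash("sha256", 0, 0); /* highest-priority impl */
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(tfm);
			return -ENOMEM;
		}
		desc->tfm = tfm;

		err = crypto_shash_digest(desc, buf, len, out);

		kfree(desc);
		crypto_free_shash(tfm);
		return err;
	}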
208 | |||
209 | #ifdef CONFIG_AS_AVX | ||
210 | static bool __init avx_usable(void) | ||
211 | { | ||
212 | u64 xcr0; | ||
213 | |||
214 | if (!cpu_has_avx || !cpu_has_osxsave) | ||
215 | return false; | ||
216 | |||
217 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
218 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
219 | pr_info("AVX detected but unusable.\n"); | ||
220 | |||
221 | return false; | ||
222 | } | ||
223 | |||
224 | return true; | ||
225 | } | ||
226 | #endif | ||
227 | |||
228 | static int __init sha256_ssse3_mod_init(void) | ||
229 | { | ||
230 | /* test for SSSE3 first */ | ||
231 | if (cpu_has_ssse3) | ||
232 | sha256_transform_asm = sha256_transform_ssse3; | ||
233 | |||
234 | #ifdef CONFIG_AS_AVX | ||
235 | /* allow AVX to override SSSE3, it's a little faster */ | ||
236 | if (avx_usable()) { | ||
237 | #ifdef CONFIG_AS_AVX2 | ||
238 | if (boot_cpu_has(X86_FEATURE_AVX2)) | ||
239 | sha256_transform_asm = sha256_transform_rorx; | ||
240 | else | ||
241 | #endif | ||
242 | sha256_transform_asm = sha256_transform_avx; | ||
243 | } | ||
244 | #endif | ||
245 | |||
246 | if (sha256_transform_asm) { | ||
247 | #ifdef CONFIG_AS_AVX | ||
248 | if (sha256_transform_asm == sha256_transform_avx) | ||
249 | pr_info("Using AVX optimized SHA-256 implementation\n"); | ||
250 | #ifdef CONFIG_AS_AVX2 | ||
251 | else if (sha256_transform_asm == sha256_transform_rorx) | ||
252 | pr_info("Using AVX2 optimized SHA-256 implementation\n"); | ||
253 | #endif | ||
254 | else | ||
255 | #endif | ||
256 | pr_info("Using SSSE3 optimized SHA-256 implementation\n"); | ||
257 | return crypto_register_shash(&alg); | ||
258 | } | ||
259 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | ||
260 | |||
261 | return -ENODEV; | ||
262 | } | ||
263 | |||
264 | static void __exit sha256_ssse3_mod_fini(void) | ||
265 | { | ||
266 | crypto_unregister_shash(&alg); | ||
267 | } | ||
268 | |||
269 | module_init(sha256_ssse3_mod_init); | ||
270 | module_exit(sha256_ssse3_mod_fini); | ||
271 | |||
272 | MODULE_LICENSE("GPL"); | ||
273 | MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | ||
274 | |||
275 | MODULE_ALIAS("sha256"); | ||
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S new file mode 100644 index 000000000000..974dde9bc6cd --- /dev/null +++ b/arch/x86/crypto/sha512-avx-asm.S | |||
@@ -0,0 +1,423 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-512 with AVX instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # David Cote <david.m.cote@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or | ||
19 | # without modification, are permitted provided that the following | ||
20 | # conditions are met: | ||
21 | # | ||
22 | # - Redistributions of source code must retain the above | ||
23 | # copyright notice, this list of conditions and the following | ||
24 | # disclaimer. | ||
25 | # | ||
26 | # - Redistributions in binary form must reproduce the above | ||
27 | # copyright notice, this list of conditions and the following | ||
28 | # disclaimer in the documentation and/or other materials | ||
29 | # provided with the distribution. | ||
30 | # | ||
31 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
32 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
33 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
34 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
35 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
36 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
37 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
38 | # SOFTWARE. | ||
39 | # | ||
40 | ######################################################################## | ||
41 | # | ||
42 | # This code is described in an Intel White-Paper: | ||
43 | # "Fast SHA-512 Implementations on Intel Architecture Processors" | ||
44 | # | ||
45 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
46 | # and search for that title. | ||
47 | # | ||
48 | ######################################################################## | ||
49 | |||
50 | #ifdef CONFIG_AS_AVX | ||
51 | #include <linux/linkage.h> | ||
52 | |||
53 | .text | ||
54 | |||
55 | # Virtual Registers | ||
56 | # ARG1 | ||
57 | msg = %rdi | ||
58 | # ARG2 | ||
59 | digest = %rsi | ||
60 | # ARG3 | ||
61 | msglen = %rdx | ||
62 | T1 = %rcx | ||
63 | T2 = %r8 | ||
64 | a_64 = %r9 | ||
65 | b_64 = %r10 | ||
66 | c_64 = %r11 | ||
67 | d_64 = %r12 | ||
68 | e_64 = %r13 | ||
69 | f_64 = %r14 | ||
70 | g_64 = %r15 | ||
71 | h_64 = %rbx | ||
72 | tmp0 = %rax | ||
73 | |||
74 | # Local variables (stack frame) | ||
75 | |||
76 | # Message Schedule | ||
77 | W_SIZE = 80*8 | ||
78 | # W[t] + K[t] | W[t+1] + K[t+1] | ||
79 | WK_SIZE = 2*8 | ||
80 | RSPSAVE_SIZE = 1*8 | ||
81 | GPRSAVE_SIZE = 5*8 | ||
82 | |||
83 | frame_W = 0 | ||
84 | frame_WK = frame_W + W_SIZE | ||
85 | frame_RSPSAVE = frame_WK + WK_SIZE | ||
86 | frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE | ||
87 | frame_size = frame_GPRSAVE + GPRSAVE_SIZE | ||
88 | |||
89 | # Useful QWORD "arrays" for simpler memory references | ||
90 | # MSG, DIGEST, K_t, W_t are arrays | ||
91 | # WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even | ||
92 | |||
93 | # Input message (arg1) | ||
94 | #define MSG(i) 8*i(msg) | ||
95 | |||
96 | # Output Digest (arg2) | ||
97 | #define DIGEST(i) 8*i(digest) | ||
98 | |||
99 | # SHA Constants (static mem) | ||
100 | #define K_t(i) 8*i+K512(%rip) | ||
101 | |||
102 | # Message Schedule (stack frame) | ||
103 | #define W_t(i) 8*i+frame_W(%rsp) | ||
104 | |||
105 | # W[t]+K[t] (stack frame) | ||
106 | #define WK_2(i) 8*((i%2))+frame_WK(%rsp) | ||
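For example, WK_2(10) resolves to frame_WK + 0(%rsp) and WK_2(11) to frame_WK + 8(%rsp): the two precomputed W[t] + K[t] qwords for an even/odd round pair simply alternate between these two slots.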
107 | |||
108 | .macro RotateState | ||
109 | # Rotate symbols a..h right | ||
110 | TMP = h_64 | ||
111 | h_64 = g_64 | ||
112 | g_64 = f_64 | ||
113 | f_64 = e_64 | ||
114 | e_64 = d_64 | ||
115 | d_64 = c_64 | ||
116 | c_64 = b_64 | ||
117 | b_64 = a_64 | ||
118 | a_64 = TMP | ||
119 | .endm | ||
120 | |||
121 | .macro RORQ p1 p2 | ||
122 | # shld is faster than ror on Sandybridge | ||
123 | shld $(64-\p2), \p1, \p1 | ||
124 | .endm | ||
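The trick works because a right-rotate by n leaves the same bit pattern as a left-rotate by 64 - n, and shld with both operands set to the same register performs exactly that left-rotate. In C terms, purely for illustration:

	#include <stdint.h>

	/* ror64(x, n) equals the result of "shld $(64-n), x, x" for 0 < n < 64. */
	static inline uint64_t ror64(uint64_t x, unsigned int n)
	{
		return (x >> n) | (x << (64 - n));
	}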
125 | |||
126 | .macro SHA512_Round rnd | ||
127 | # Compute Round %%t | ||
128 | mov f_64, T1 # T1 = f | ||
129 | mov e_64, tmp0 # tmp = e | ||
130 | xor g_64, T1 # T1 = f ^ g | ||
131 | RORQ tmp0, 23 # 41 # tmp = e ror 23 | ||
132 | and e_64, T1 # T1 = (f ^ g) & e | ||
133 | xor e_64, tmp0 # tmp = (e ror 23) ^ e | ||
134 | xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) | ||
135 | idx = \rnd | ||
136 | add WK_2(idx), T1 # W[t] + K[t] from message scheduler | ||
137 | RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 | ||
138 | xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e | ||
139 | mov a_64, T2 # T2 = a | ||
140 | add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h | ||
141 | RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) | ||
142 | add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) | ||
143 | mov a_64, tmp0 # tmp = a | ||
144 | xor c_64, T2 # T2 = a ^ c | ||
145 | and c_64, tmp0 # tmp = a & c | ||
146 | and b_64, T2 # T2 = (a ^ c) & b | ||
147 | xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) | ||
148 | mov a_64, tmp0 # tmp = a | ||
149 | RORQ tmp0, 5 # 39 # tmp = a ror 5 | ||
150 | xor a_64, tmp0 # tmp = (a ror 5) ^ a | ||
151 | add T1, d_64 # e(next_state) = d + T1 | ||
152 | RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 | ||
153 | xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a | ||
154 | lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) | ||
155 | RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) | ||
156 | add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) + S0(a) | ||
157 | RotateState | ||
158 | .endm | ||
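The same round written out as scalar C, following FIPS 180-4; wk stands for the W[t] + K[t] value fetched from WK_2(t), and all identifiers are illustrative rather than part of this patch:

	#include <stdint.h>

	static inline uint64_t ror64(uint64_t x, unsigned int n)
	{
		return (x >> n) | (x << (64 - n));
	}

	static void sha512_round(uint64_t st[8], uint64_t wk)
	{
		uint64_t a = st[0], b = st[1], c = st[2], d = st[3];
		uint64_t e = st[4], f = st[5], g = st[6], h = st[7];
		uint64_t s1  = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
		uint64_t ch  = ((f ^ g) & e) ^ g;        /* same form as the asm */
		uint64_t s0  = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
		uint64_t maj = ((a ^ c) & b) ^ (a & c);  /* same form as the asm */
		uint64_t t1  = h + s1 + ch + wk;

		st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
		st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + s0 + maj;
	}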
159 | |||
160 | .macro SHA512_2Sched_2Round_avx rnd | ||
161 | # Compute rounds t-2 and t-1 | ||
162 | # Compute message schedule QWORDS t and t+1 | ||
163 | |||
164 | # Two rounds are computed based on the values for K[t-2]+W[t-2] and | ||
165 | # K[t-1]+W[t-1] which were previously stored at WK_2 by the message | ||
166 | # scheduler. | ||
167 | # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. | ||
168 | # They are then added to their respective SHA512 constants at | ||
169 | # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] | ||
170 | # For brevity, the comments following vectored instructions only refer to | ||
171 | # the first of a pair of QWORDS. | ||
172 | # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} | ||
173 | # The computation of the message schedule and the rounds are tightly | ||
174 | # stitched to take advantage of instruction-level parallelism. | ||
175 | |||
176 | idx = \rnd - 2 | ||
177 | vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] | ||
178 | idx = \rnd - 15 | ||
179 | vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] | ||
180 | mov f_64, T1 | ||
181 | vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 | ||
182 | mov e_64, tmp0 | ||
183 | vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 | ||
184 | xor g_64, T1 | ||
185 | RORQ tmp0, 23 # 41 | ||
186 | vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 | ||
187 | and e_64, T1 | ||
188 | xor e_64, tmp0 | ||
189 | vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 | ||
190 | xor g_64, T1 | ||
191 | idx = \rnd | ||
192 | add WK_2(idx), T1# | ||
193 | vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 | ||
194 | RORQ tmp0, 4 # 18 | ||
195 | vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 | ||
196 | xor e_64, tmp0 | ||
197 | mov a_64, T2 | ||
198 | add h_64, T1 | ||
199 | vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 | ||
200 | RORQ tmp0, 14 # 14 | ||
201 | add tmp0, T1 | ||
202 | vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 | ||
203 | mov a_64, tmp0 | ||
204 | xor c_64, T2 | ||
205 | vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 | ||
206 | and c_64, tmp0 | ||
207 | and b_64, T2 | ||
208 | vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 | ||
209 | xor tmp0, T2 | ||
210 | mov a_64, tmp0 | ||
211 | vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 | ||
212 | RORQ tmp0, 5 # 39 | ||
213 | vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 | ||
214 | xor a_64, tmp0 | ||
215 | add T1, d_64 | ||
216 | RORQ tmp0, 6 # 34 | ||
217 | xor a_64, tmp0 | ||
218 | vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ | ||
219 | # W[t-15]>>7 ^ W[t-15]<<63 | ||
220 | lea (T1, T2), h_64 | ||
221 | RORQ tmp0, 28 # 28 | ||
222 | vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<45 | ||
223 | add tmp0, h_64 | ||
224 | RotateState | ||
225 | vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ | ||
226 | # W[t-2]<<45 | ||
227 | mov f_64, T1 | ||
228 | vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) | ||
229 | mov e_64, tmp0 | ||
230 | xor g_64, T1 | ||
231 | idx = \rnd - 16 | ||
232 | vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] | ||
233 | idx = \rnd - 7 | ||
234 | vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] | ||
235 | RORQ tmp0, 23 # 41 | ||
236 | and e_64, T1 | ||
237 | xor e_64, tmp0 | ||
238 | xor g_64, T1 | ||
239 | vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 | ||
240 | idx = \rnd + 1 | ||
241 | add WK_2(idx), T1 | ||
242 | vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) | ||
243 | RORQ tmp0, 4 # 18 | ||
244 | vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) | ||
245 | xor e_64, tmp0 | ||
246 | vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + | ||
247 | # s0(W[t-15]) + W[t-16] | ||
248 | mov a_64, T2 | ||
249 | add h_64, T1 | ||
250 | RORQ tmp0, 14 # 14 | ||
251 | add tmp0, T1 | ||
252 | idx = \rnd | ||
253 | vmovdqa %xmm0, W_t(idx) # Store W[t] | ||
254 | vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] | ||
255 | vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds | ||
256 | mov a_64, tmp0 | ||
257 | xor c_64, T2 | ||
258 | and c_64, tmp0 | ||
259 | and b_64, T2 | ||
260 | xor tmp0, T2 | ||
261 | mov a_64, tmp0 | ||
262 | RORQ tmp0, 5 # 39 | ||
263 | xor a_64, tmp0 | ||
264 | add T1, d_64 | ||
265 | RORQ tmp0, 6 # 34 | ||
266 | xor a_64, tmp0 | ||
267 | lea (T1, T2), h_64 | ||
268 | RORQ tmp0, 28 # 28 | ||
269 | add tmp0, h_64 | ||
270 | RotateState | ||
271 | .endm | ||
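For reference, the scalar form of the schedule being interleaved here is W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], with s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7) and s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6); each vector rotate is realised as a right shift plus the compensating left shift (for example the $19 / $(64-19) pair above).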
272 | |||
273 | ######################################################################## | ||
274 | # void sha512_transform_avx(const void* M, void* D, u64 L) | ||
275 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | ||
276 | # The size of the message pointed to by M must be an integer multiple of SHA512 | ||
277 | # message blocks. | ||
278 | # L is the message length in SHA512 blocks | ||
279 | ######################################################################## | ||
280 | ENTRY(sha512_transform_avx) | ||
281 | cmp $0, msglen | ||
282 | je nowork | ||
283 | |||
284 | # Allocate Stack Space | ||
285 | mov %rsp, %rax | ||
286 | sub $frame_size, %rsp | ||
287 | and $~(0x20 - 1), %rsp | ||
288 | mov %rax, frame_RSPSAVE(%rsp) | ||
289 | |||
290 | # Save GPRs | ||
291 | mov %rbx, frame_GPRSAVE(%rsp) | ||
292 | mov %r12, frame_GPRSAVE +8*1(%rsp) | ||
293 | mov %r13, frame_GPRSAVE +8*2(%rsp) | ||
294 | mov %r14, frame_GPRSAVE +8*3(%rsp) | ||
295 | mov %r15, frame_GPRSAVE +8*4(%rsp) | ||
296 | |||
297 | updateblock: | ||
298 | |||
299 | # Load state variables | ||
300 | mov DIGEST(0), a_64 | ||
301 | mov DIGEST(1), b_64 | ||
302 | mov DIGEST(2), c_64 | ||
303 | mov DIGEST(3), d_64 | ||
304 | mov DIGEST(4), e_64 | ||
305 | mov DIGEST(5), f_64 | ||
306 | mov DIGEST(6), g_64 | ||
307 | mov DIGEST(7), h_64 | ||
308 | |||
309 | t = 0 | ||
310 | .rept 80/2 + 1 | ||
311 | # (80 rounds) / (2 rounds/iteration) + (1 iteration) | ||
312 | # +1 iteration because the scheduler leads hashing by 1 iteration | ||
313 | .if t < 2 | ||
314 | # BSWAP 2 QWORDS | ||
315 | vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 | ||
316 | vmovdqu MSG(t), %xmm0 | ||
317 | vpshufb %xmm1, %xmm0, %xmm0 # BSWAP | ||
318 | vmovdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
319 | vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] | ||
320 | vmovdqa %xmm0, WK_2(t) # Store into WK for rounds | ||
321 | .elseif t < 16 | ||
322 | # BSWAP 2 QWORDS# Compute 2 Rounds | ||
323 | vmovdqu MSG(t), %xmm0 | ||
324 | vpshufb %xmm1, %xmm0, %xmm0 # BSWAP | ||
325 | SHA512_Round t-2 # Round t-2 | ||
326 | vmovdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
327 | vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] | ||
328 | SHA512_Round t-1 # Round t-1 | ||
329 | vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK | ||
330 | .elseif t < 79 | ||
331 | # Schedule 2 QWORDS# Compute 2 Rounds | ||
332 | SHA512_2Sched_2Round_avx t | ||
333 | .else | ||
334 | # Compute 2 Rounds | ||
335 | SHA512_Round t-2 | ||
336 | SHA512_Round t-1 | ||
337 | .endif | ||
338 | t = t+2 | ||
339 | .endr | ||
340 | |||
341 | # Update digest | ||
342 | add a_64, DIGEST(0) | ||
343 | add b_64, DIGEST(1) | ||
344 | add c_64, DIGEST(2) | ||
345 | add d_64, DIGEST(3) | ||
346 | add e_64, DIGEST(4) | ||
347 | add f_64, DIGEST(5) | ||
348 | add g_64, DIGEST(6) | ||
349 | add h_64, DIGEST(7) | ||
350 | |||
351 | # Advance to next message block | ||
352 | add $16*8, msg | ||
353 | dec msglen | ||
354 | jnz updateblock | ||
355 | |||
356 | # Restore GPRs | ||
357 | mov frame_GPRSAVE(%rsp), %rbx | ||
358 | mov frame_GPRSAVE +8*1(%rsp), %r12 | ||
359 | mov frame_GPRSAVE +8*2(%rsp), %r13 | ||
360 | mov frame_GPRSAVE +8*3(%rsp), %r14 | ||
361 | mov frame_GPRSAVE +8*4(%rsp), %r15 | ||
362 | |||
363 | # Restore Stack Pointer | ||
364 | mov frame_RSPSAVE(%rsp), %rsp | ||
365 | |||
366 | nowork: | ||
367 | ret | ||
368 | ENDPROC(sha512_transform_avx) | ||
369 | |||
370 | ######################################################################## | ||
371 | ### Binary Data | ||
372 | |||
373 | .data | ||
374 | |||
375 | .align 16 | ||
376 | |||
377 | # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. | ||
378 | XMM_QWORD_BSWAP: | ||
379 | .octa 0x08090a0b0c0d0e0f0001020304050607 | ||
380 | |||
381 | # K[t] used in SHA512 hashing | ||
382 | K512: | ||
383 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd | ||
384 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
385 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 | ||
386 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
387 | .quad 0xd807aa98a3030242,0x12835b0145706fbe | ||
388 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
389 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
390 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
391 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
392 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
393 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
394 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
395 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
396 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
397 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
398 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 | ||
399 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
400 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
401 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
402 | .quad 0x81c2c92e47edaee6,0x92722c851482353b | ||
403 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
404 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
405 | .quad 0xd192e819d6ef5218,0xd69906245565a910 | ||
406 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
407 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
408 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
409 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
410 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
411 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
412 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
413 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 | ||
414 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
415 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 | ||
416 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
417 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
418 | .quad 0x113f9804bef90dae,0x1b710b35131c471b | ||
419 | .quad 0x28db77f523047d84,0x32caab7b40c72493 | ||
420 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
421 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
422 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
423 | #endif | ||
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S new file mode 100644 index 000000000000..568b96105f5c --- /dev/null +++ b/arch/x86/crypto/sha512-avx2-asm.S | |||
@@ -0,0 +1,743 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-512 with AVX2 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # David Cote <david.m.cote@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or | ||
19 | # without modification, are permitted provided that the following | ||
20 | # conditions are met: | ||
21 | # | ||
22 | # - Redistributions of source code must retain the above | ||
23 | # copyright notice, this list of conditions and the following | ||
24 | # disclaimer. | ||
25 | # | ||
26 | # - Redistributions in binary form must reproduce the above | ||
27 | # copyright notice, this list of conditions and the following | ||
28 | # disclaimer in the documentation and/or other materials | ||
29 | # provided with the distribution. | ||
30 | # | ||
31 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
32 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
33 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
34 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
35 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
36 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
37 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
38 | # SOFTWARE. | ||
39 | # | ||
40 | ######################################################################## | ||
41 | # | ||
42 | # This code is described in an Intel White-Paper: | ||
43 | # "Fast SHA-512 Implementations on Intel Architecture Processors" | ||
44 | # | ||
45 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
46 | # and search for that title. | ||
47 | # | ||
48 | ######################################################################## | ||
49 | # This code schedules 1 block at a time, with 4 lanes per block | ||
50 | ######################################################################## | ||
51 | |||
52 | #ifdef CONFIG_AS_AVX2 | ||
53 | #include <linux/linkage.h> | ||
54 | |||
55 | .text | ||
56 | |||
57 | # Virtual Registers | ||
58 | Y_0 = %ymm4 | ||
59 | Y_1 = %ymm5 | ||
60 | Y_2 = %ymm6 | ||
61 | Y_3 = %ymm7 | ||
62 | |||
63 | YTMP0 = %ymm0 | ||
64 | YTMP1 = %ymm1 | ||
65 | YTMP2 = %ymm2 | ||
66 | YTMP3 = %ymm3 | ||
67 | YTMP4 = %ymm8 | ||
68 | XFER = YTMP0 | ||
69 | |||
70 | BYTE_FLIP_MASK = %ymm9 | ||
71 | |||
72 | # 1st arg | ||
73 | INP = %rdi | ||
74 | # 2nd arg | ||
75 | CTX = %rsi | ||
76 | # 3rd arg | ||
77 | NUM_BLKS = %rdx | ||
78 | |||
79 | c = %rcx | ||
80 | d = %r8 | ||
81 | e = %rdx | ||
82 | y3 = %rdi | ||
83 | |||
84 | TBL = %rbp | ||
85 | |||
86 | a = %rax | ||
87 | b = %rbx | ||
88 | |||
89 | f = %r9 | ||
90 | g = %r10 | ||
91 | h = %r11 | ||
92 | old_h = %r11 | ||
93 | |||
94 | T1 = %r12 | ||
95 | y0 = %r13 | ||
96 | y1 = %r14 | ||
97 | y2 = %r15 | ||
98 | |||
99 | y4 = %r12 | ||
100 | |||
101 | # Local variables (stack frame) | ||
102 | XFER_SIZE = 4*8 | ||
103 | SRND_SIZE = 1*8 | ||
104 | INP_SIZE = 1*8 | ||
105 | INPEND_SIZE = 1*8 | ||
106 | RSPSAVE_SIZE = 1*8 | ||
107 | GPRSAVE_SIZE = 6*8 | ||
108 | |||
109 | frame_XFER = 0 | ||
110 | frame_SRND = frame_XFER + XFER_SIZE | ||
111 | frame_INP = frame_SRND + SRND_SIZE | ||
112 | frame_INPEND = frame_INP + INP_SIZE | ||
113 | frame_RSPSAVE = frame_INPEND + INPEND_SIZE | ||
114 | frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE | ||
115 | frame_size = frame_GPRSAVE + GPRSAVE_SIZE | ||
116 | |||
117 | ## assume buffers not aligned | ||
118 | #define VMOVDQ vmovdqu | ||
119 | |||
120 | # addm [mem], reg | ||
121 | # Add reg to mem using reg-mem add and store | ||
122 | .macro addm p1 p2 | ||
123 | add \p1, \p2 | ||
124 | mov \p2, \p1 | ||
125 | .endm | ||
126 | |||
127 | |||
128 | # COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask | ||
129 | # Load ymm with mem and byte swap each dword | ||
130 | .macro COPY_YMM_AND_BSWAP p1 p2 p3 | ||
131 | VMOVDQ \p2, \p1 | ||
132 | vpshufb \p3, \p1, \p1 | ||
133 | .endm | ||
134 | # rotate_Ys | ||
135 | # Rotate values of symbols Y0...Y3 | ||
136 | .macro rotate_Ys | ||
137 | Y_ = Y_0 | ||
138 | Y_0 = Y_1 | ||
139 | Y_1 = Y_2 | ||
140 | Y_2 = Y_3 | ||
141 | Y_3 = Y_ | ||
142 | .endm | ||
143 | |||
144 | # RotateState | ||
145 | .macro RotateState | ||
146 | # Rotate symbols a..h right | ||
147 | old_h = h | ||
148 | TMP_ = h | ||
149 | h = g | ||
150 | g = f | ||
151 | f = e | ||
152 | e = d | ||
153 | d = c | ||
154 | c = b | ||
155 | b = a | ||
156 | a = TMP_ | ||
157 | .endm | ||
158 | |||
159 | # macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL | ||
160 | # YDST = {YSRC1, YSRC2} >> RVAL*8 | ||
161 | .macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL | ||
162 | vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} | ||
163 | vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 | ||
164 | .endm | ||
165 | |||
166 | .macro FOUR_ROUNDS_AND_SCHED | ||
167 | ################################### RND N + 0 ######################################### | ||
168 | |||
169 | # Extract w[t-7] | ||
170 | MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] | ||
171 | # Calculate w[t-16] + w[t-7] | ||
172 | vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] | ||
173 | # Extract w[t-15] | ||
174 | MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] | ||
175 | |||
176 | # Calculate sigma0 | ||
177 | |||
178 | # Calculate w[t-15] ror 1 | ||
179 | vpsrlq $1, YTMP1, YTMP2 | ||
180 | vpsllq $(64-1), YTMP1, YTMP3 | ||
181 | vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 | ||
182 | # Calculate w[t-15] shr 7 | ||
183 | vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 | ||
184 | |||
185 | mov a, y3 # y3 = a # MAJA | ||
186 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
187 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
188 | add frame_XFER(%rsp),h # h = k + w + h # -- | ||
189 | or c, y3 # y3 = a|c # MAJA | ||
190 | mov f, y2 # y2 = f # CH | ||
191 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
192 | |||
193 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
194 | xor g, y2 # y2 = f^g # CH | ||
195 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
196 | |||
197 | and e, y2 # y2 = (f^g)&e # CH | ||
198 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
199 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
200 | add h, d # d = k + w + h + d # -- | ||
201 | |||
202 | and b, y3 # y3 = (a|c)&b # MAJA | ||
203 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
204 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
205 | |||
206 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
207 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
208 | mov a, T1 # T1 = a # MAJB | ||
209 | and c, T1 # T1 = a&c # MAJB | ||
210 | |||
211 | add y0, y2 # y2 = S1 + CH # -- | ||
212 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
213 | add y1, h # h = k + w + h + S0 # -- | ||
214 | |||
215 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
216 | |||
217 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
218 | add y3, h # h = t1 + S0 + MAJ # -- | ||
219 | |||
220 | RotateState | ||
221 | |||
222 | ################################### RND N + 1 ######################################### | ||
223 | |||
224 | # Calculate w[t-15] ror 8 | ||
225 | vpsrlq $8, YTMP1, YTMP2 | ||
226 | vpsllq $(64-8), YTMP1, YTMP1 | ||
227 | vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 | ||
228 | # XOR the three components | ||
229 | vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 | ||
230 | vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 | ||
231 | |||
232 | |||
233 | # Add three components, w[t-16], w[t-7] and sigma0 | ||
234 | vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 | ||
235 | # Move to appropriate lanes for calculating w[16] and w[17] | ||
236 | vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} | ||
237 | # Move to appropriate lanes for calculating w[18] and w[19] | ||
238 | vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} | ||
239 | |||
240 | # Calculate w[16] and w[17] in both 128 bit lanes | ||
241 | |||
242 | # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes | ||
243 | vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} | ||
244 | vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} | ||
245 | |||
246 | |||
247 | mov a, y3 # y3 = a # MAJA | ||
248 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
249 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
250 | add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- | ||
251 | or c, y3 # y3 = a|c # MAJA | ||
252 | |||
253 | |||
254 | mov f, y2 # y2 = f # CH | ||
255 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
256 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
257 | xor g, y2 # y2 = f^g # CH | ||
258 | |||
259 | |||
260 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
261 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
262 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
263 | and e, y2 # y2 = (f^g)&e # CH | ||
264 | add h, d # d = k + w + h + d # -- | ||
265 | |||
266 | and b, y3 # y3 = (a|c)&b # MAJA | ||
267 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
268 | |||
269 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
270 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
271 | |||
272 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
273 | mov a, T1 # T1 = a # MAJB | ||
274 | and c, T1 # T1 = a&c # MAJB | ||
275 | add y0, y2 # y2 = S1 + CH # -- | ||
276 | |||
277 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
278 | add y1, h # h = k + w + h + S0 # -- | ||
279 | |||
280 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
281 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
282 | add y3, h # h = t1 + S0 + MAJ # -- | ||
283 | |||
284 | RotateState | ||
285 | |||
286 | |||
287 | ################################### RND N + 2 ######################################### | ||
288 | |||
289 | vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} | ||
290 | vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} | ||
291 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} | ||
292 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} | ||
293 | vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} | ||
294 | vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} | ||
295 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} | ||
296 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ | ||
297 | # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} | ||
298 | |||
299 | # Add sigma1 to the other components to get w[16] and w[17] | ||
300 | vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} | ||
301 | |||
302 | # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane | ||
303 | vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} | ||
304 | |||
305 | mov a, y3 # y3 = a # MAJA | ||
306 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
307 | add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- | ||
308 | |||
309 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
310 | or c, y3 # y3 = a|c # MAJA | ||
311 | mov f, y2 # y2 = f # CH | ||
312 | xor g, y2 # y2 = f^g # CH | ||
313 | |||
314 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
315 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
316 | and e, y2 # y2 = (f^g)&e # CH | ||
317 | |||
318 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
319 | add h, d # d = k + w + h + d # -- | ||
320 | and b, y3 # y3 = (a|c)&b # MAJA | ||
321 | |||
322 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
323 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
324 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
325 | |||
326 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
327 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
328 | |||
329 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
330 | mov a, T1 # T1 = a # MAJB | ||
331 | and c, T1 # T1 = a&c # MAJB | ||
332 | add y0, y2 # y2 = S1 + CH # -- | ||
333 | |||
334 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
335 | add y1, h # h = k + w + h + S0 # -- | ||
336 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
337 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
338 | |||
339 | add y3, h # h = t1 + S0 + MAJ # -- | ||
340 | |||
341 | RotateState | ||
342 | |||
343 | ################################### RND N + 3 ######################################### | ||
344 | |||
345 | vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} | ||
346 | vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} | ||
347 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} | ||
348 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} | ||
349 | vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} | ||
350 | vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} | ||
351 | vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} | ||
352 | vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ | ||
353 | # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} | ||
354 | |||
355 | # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] | ||
356 | # to newly calculated sigma1 to get w[18] and w[19] | ||
357 | vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} | ||
358 | |||
359 | # Form w[19], w[18], w[17], w[16] | ||
360 | vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} | ||
361 | |||
362 | mov a, y3 # y3 = a # MAJA | ||
363 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
364 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
365 | add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- | ||
366 | or c, y3 # y3 = a|c # MAJA | ||
367 | |||
368 | |||
369 | mov f, y2 # y2 = f # CH | ||
370 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
371 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
372 | xor g, y2 # y2 = f^g # CH | ||
373 | |||
374 | |||
375 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
376 | and e, y2 # y2 = (f^g)&e # CH | ||
377 | add h, d # d = k + w + h + d # -- | ||
378 | and b, y3 # y3 = (a|c)&b # MAJA | ||
379 | |||
380 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
381 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
382 | |||
383 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
384 | add y0, y2 # y2 = S1 + CH # -- | ||
385 | |||
386 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
387 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
388 | |||
389 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
390 | |||
391 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
392 | mov a, T1 # T1 = a # MAJB | ||
393 | and c, T1 # T1 = a&c # MAJB | ||
394 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
395 | |||
396 | add y1, h # h = k + w + h + S0 # -- | ||
397 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
398 | add y3, h # h = t1 + S0 + MAJ # -- | ||
399 | |||
400 | RotateState | ||
401 | |||
402 | rotate_Ys | ||
403 | .endm | ||
404 | |||
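# DO_4ROUNDS: four rounds of the compression function without message
# scheduling.  To shorten the dependency chain, the additions that complete
# h for one round are deferred into the following round via old_h (after
# RotateState, old_h names the same register as the new a); only the last
# round of the group finishes h within the round itself.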
405 | .macro DO_4ROUNDS | ||
406 | |||
407 | ################################### RND N + 0 ######################################### | ||
408 | |||
409 | mov f, y2 # y2 = f # CH | ||
410 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
411 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
412 | xor g, y2 # y2 = f^g # CH | ||
413 | |||
414 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
415 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
416 | and e, y2 # y2 = (f^g)&e # CH | ||
417 | |||
418 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
419 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
420 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
421 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
422 | mov a, y3 # y3 = a # MAJA | ||
423 | |||
424 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
425 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
426 | add frame_XFER(%rsp), h # h = k + w + h # -- | ||
427 | or c, y3 # y3 = a|c # MAJA | ||
428 | |||
429 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
430 | mov a, T1 # T1 = a # MAJB | ||
431 | and b, y3 # y3 = (a|c)&b # MAJA | ||
432 | and c, T1 # T1 = a&c # MAJB | ||
433 | add y0, y2 # y2 = S1 + CH # -- | ||
434 | |||
435 | add h, d # d = k + w + h + d # -- | ||
436 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
437 | add y1, h # h = k + w + h + S0 # -- | ||
438 | |||
439 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
440 | |||
441 | RotateState | ||
442 | |||
443 | ################################### RND N + 1 ######################################### | ||
444 | |||
445 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
446 | mov f, y2 # y2 = f # CH | ||
447 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
448 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
449 | xor g, y2 # y2 = f^g # CH | ||
450 | |||
451 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
452 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
453 | and e, y2 # y2 = (f^g)&e # CH | ||
454 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
455 | |||
456 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
457 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
458 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
459 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
460 | mov a, y3 # y3 = a # MAJA | ||
461 | |||
462 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
463 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
464 | add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- | ||
465 | or c, y3 # y3 = a|c # MAJA | ||
466 | |||
467 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
468 | mov a, T1 # T1 = a # MAJB | ||
469 | and b, y3 # y3 = (a|c)&b # MAJA | ||
470 | and c, T1 # T1 = a&c # MAJB | ||
471 | add y0, y2 # y2 = S1 + CH # -- | ||
472 | |||
473 | add h, d # d = k + w + h + d # -- | ||
474 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
475 | add y1, h # h = k + w + h + S0 # -- | ||
476 | |||
477 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
478 | |||
479 | RotateState | ||
480 | |||
481 | ################################### RND N + 2 ######################################### | ||
482 | |||
483 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
484 | mov f, y2 # y2 = f # CH | ||
485 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
486 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
487 | xor g, y2 # y2 = f^g # CH | ||
488 | |||
489 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
490 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
491 | and e, y2 # y2 = (f^g)&e # CH | ||
492 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
493 | |||
494 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
495 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
496 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
497 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
498 | mov a, y3 # y3 = a # MAJA | ||
499 | |||
500 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
501 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
502 | add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- | ||
503 | or c, y3 # y3 = a|c # MAJA | ||
504 | |||
505 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
506 | mov a, T1 # T1 = a # MAJB | ||
507 | and b, y3 # y3 = (a|c)&b # MAJA | ||
508 | and c, T1 # T1 = a&c # MAJB | ||
509 | add y0, y2 # y2 = S1 + CH # -- | ||
510 | |||
511 | add h, d # d = k + w + h + d # -- | ||
512 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
513 | add y1, h # h = k + w + h + S0 # -- | ||
514 | |||
515 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
516 | |||
517 | RotateState | ||
518 | |||
519 | ################################### RND N + 3 ######################################### | ||
520 | |||
521 | add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
522 | mov f, y2 # y2 = f # CH | ||
523 | rorx $41, e, y0 # y0 = e >> 41 # S1A | ||
524 | rorx $18, e, y1 # y1 = e >> 18 # S1B | ||
525 | xor g, y2 # y2 = f^g # CH | ||
526 | |||
527 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 | ||
528 | rorx $14, e, y1 # y1 = (e >> 14) # S1 | ||
529 | and e, y2 # y2 = (f^g)&e # CH | ||
530 | add y3, old_h # h = t1 + S0 + MAJ # -- | ||
531 | |||
532 | xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 | ||
533 | rorx $34, a, T1 # T1 = a >> 34 # S0B | ||
534 | xor g, y2 # y2 = CH = ((f^g)&e)^g # CH | ||
535 | rorx $39, a, y1 # y1 = a >> 39 # S0A | ||
536 | mov a, y3 # y3 = a # MAJA | ||
537 | |||
538 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 | ||
539 | rorx $28, a, T1 # T1 = (a >> 28) # S0 | ||
540 | add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- | ||
541 | or c, y3 # y3 = a|c # MAJA | ||
542 | |||
543 | xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 | ||
544 | mov a, T1 # T1 = a # MAJB | ||
545 | and b, y3 # y3 = (a|c)&b # MAJA | ||
546 | and c, T1 # T1 = a&c # MAJB | ||
547 | add y0, y2 # y2 = S1 + CH # -- | ||
548 | |||
549 | |||
550 | add h, d # d = k + w + h + d # -- | ||
551 | or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ | ||
552 | add y1, h # h = k + w + h + S0 # -- | ||
553 | |||
554 | add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- | ||
555 | |||
556 | add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- | ||
557 | |||
558 | add y3, h # h = t1 + S0 + MAJ # -- | ||
559 | |||
560 | RotateState | ||
561 | |||
562 | .endm | ||
563 | |||
564 | ######################################################################## | ||
565 | # void sha512_transform_rorx(const void* M, void* D, uint64_t L) | ||
566 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | ||
567 | # The size of the message pointed to by M must be an integer multiple of SHA512 | ||
568 | # message blocks. | ||
569 | # L is the message length in SHA512 blocks | ||
570 | ######################################################################## | ||
571 | ENTRY(sha512_transform_rorx) | ||
572 | # Allocate Stack Space | ||
573 | mov %rsp, %rax | ||
574 | sub $frame_size, %rsp | ||
575 | and $~(0x20 - 1), %rsp | ||
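# %rsp is aligned down to 32 bytes so the vmovdqa stores of XFER into
# frame_XFER below land on an aligned address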
576 | mov %rax, frame_RSPSAVE(%rsp) | ||
577 | |||
578 | # Save GPRs | ||
579 | mov %rbp, frame_GPRSAVE(%rsp) | ||
580 | mov %rbx, 8*1+frame_GPRSAVE(%rsp) | ||
581 | mov %r12, 8*2+frame_GPRSAVE(%rsp) | ||
582 | mov %r13, 8*3+frame_GPRSAVE(%rsp) | ||
583 | mov %r14, 8*4+frame_GPRSAVE(%rsp) | ||
584 | mov %r15, 8*5+frame_GPRSAVE(%rsp) | ||
585 | |||
586 | shl $7, NUM_BLKS # convert to bytes | ||
587 | jz done_hash | ||
588 | add INP, NUM_BLKS # pointer to end of data | ||
589 | mov NUM_BLKS, frame_INPEND(%rsp) | ||
590 | |||
591 | ## load initial digest | ||
592 | mov 8*0(CTX),a | ||
593 | mov 8*1(CTX),b | ||
594 | mov 8*2(CTX),c | ||
595 | mov 8*3(CTX),d | ||
596 | mov 8*4(CTX),e | ||
597 | mov 8*5(CTX),f | ||
598 | mov 8*6(CTX),g | ||
599 | mov 8*7(CTX),h | ||
600 | |||
601 | vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK | ||
602 | |||
603 | loop0: | ||
604 | lea K512(%rip), TBL | ||
605 | |||
606 | ## byte swap first 16 qwords | ||
607 | COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK | ||
608 | COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK | ||
609 | COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK | ||
610 | COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK | ||
611 | |||
612 | mov INP, frame_INP(%rsp) | ||
613 | |||
614 | ## schedule 64 message qwords, by doing 16 groups of 4 rounds each | ||
615 | movq $4, frame_SRND(%rsp) | ||
616 | |||
617 | .align 16 | ||
618 | loop1: | ||
619 | vpaddq (TBL), Y_0, XFER | ||
620 | vmovdqa XFER, frame_XFER(%rsp) | ||
621 | FOUR_ROUNDS_AND_SCHED | ||
622 | |||
623 | vpaddq 1*32(TBL), Y_0, XFER | ||
624 | vmovdqa XFER, frame_XFER(%rsp) | ||
625 | FOUR_ROUNDS_AND_SCHED | ||
626 | |||
627 | vpaddq 2*32(TBL), Y_0, XFER | ||
628 | vmovdqa XFER, frame_XFER(%rsp) | ||
629 | FOUR_ROUNDS_AND_SCHED | ||
630 | |||
631 | vpaddq 3*32(TBL), Y_0, XFER | ||
632 | vmovdqa XFER, frame_XFER(%rsp) | ||
633 | add $(4*32), TBL | ||
634 | FOUR_ROUNDS_AND_SCHED | ||
635 | |||
636 | subq $1, frame_SRND(%rsp) | ||
637 | jne loop1 | ||
638 | |||
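# Final 16 rounds: Y_0..Y_3 already hold the last scheduled qwords
# (W[64]..W[79]), so only DO_4ROUNDS is needed, with no further scheduling.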
639 | movq $2, frame_SRND(%rsp) | ||
640 | loop2: | ||
641 | vpaddq (TBL), Y_0, XFER | ||
642 | vmovdqa XFER, frame_XFER(%rsp) | ||
643 | DO_4ROUNDS | ||
644 | vpaddq 1*32(TBL), Y_1, XFER | ||
645 | vmovdqa XFER, frame_XFER(%rsp) | ||
646 | add $(2*32), TBL | ||
647 | DO_4ROUNDS | ||
648 | |||
649 | vmovdqa Y_2, Y_0 | ||
650 | vmovdqa Y_3, Y_1 | ||
651 | |||
652 | subq $1, frame_SRND(%rsp) | ||
653 | jne loop2 | ||
654 | |||
655 | addm 8*0(CTX),a | ||
656 | addm 8*1(CTX),b | ||
657 | addm 8*2(CTX),c | ||
658 | addm 8*3(CTX),d | ||
659 | addm 8*4(CTX),e | ||
660 | addm 8*5(CTX),f | ||
661 | addm 8*6(CTX),g | ||
662 | addm 8*7(CTX),h | ||
663 | |||
664 | mov frame_INP(%rsp), INP | ||
665 | add $128, INP | ||
666 | cmp frame_INPEND(%rsp), INP | ||
667 | jne loop0 | ||
668 | |||
669 | done_hash: | ||
670 | |||
671 | # Restore GPRs | ||
672 | mov frame_GPRSAVE(%rsp) ,%rbp | ||
673 | mov 8*1+frame_GPRSAVE(%rsp) ,%rbx | ||
674 | mov 8*2+frame_GPRSAVE(%rsp) ,%r12 | ||
675 | mov 8*3+frame_GPRSAVE(%rsp) ,%r13 | ||
676 | mov 8*4+frame_GPRSAVE(%rsp) ,%r14 | ||
677 | mov 8*5+frame_GPRSAVE(%rsp) ,%r15 | ||
678 | |||
679 | # Restore Stack Pointer | ||
680 | mov frame_RSPSAVE(%rsp), %rsp | ||
681 | ret | ||
682 | ENDPROC(sha512_transform_rorx) | ||
683 | |||
684 | ######################################################################## | ||
685 | ### Binary Data | ||
686 | |||
687 | .data | ||
688 | |||
689 | .align 64 | ||
690 | # K[t] used in SHA512 hashing | ||
691 | K512: | ||
692 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd | ||
693 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
694 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 | ||
695 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
696 | .quad 0xd807aa98a3030242,0x12835b0145706fbe | ||
697 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
698 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
699 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
700 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
701 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
702 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
703 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
704 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
705 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
706 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
707 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 | ||
708 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
709 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
710 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
711 | .quad 0x81c2c92e47edaee6,0x92722c851482353b | ||
712 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
713 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
714 | .quad 0xd192e819d6ef5218,0xd69906245565a910 | ||
715 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
716 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
717 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
718 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
719 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
720 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
721 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
722 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 | ||
723 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
724 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 | ||
725 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
726 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
727 | .quad 0x113f9804bef90dae,0x1b710b35131c471b | ||
728 | .quad 0x28db77f523047d84,0x32caab7b40c72493 | ||
729 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
730 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
731 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
732 | |||
733 | .align 32 | ||
734 | |||
735 | # Mask for byte-swapping the qwords in a YMM register using vpshufb. | ||
736 | PSHUFFLE_BYTE_FLIP_MASK: | ||
737 | .octa 0x08090a0b0c0d0e0f0001020304050607 | ||
738 | .octa 0x18191a1b1c1d1e1f1011121314151617 | ||
739 | |||
740 | MASK_YMM_LO: | ||
741 | .octa 0x00000000000000000000000000000000 | ||
742 | .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF | ||
743 | #endif | ||
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S new file mode 100644 index 000000000000..fb56855d51f5 --- /dev/null +++ b/arch/x86/crypto/sha512-ssse3-asm.S | |||
@@ -0,0 +1,421 @@ | |||
1 | ######################################################################## | ||
2 | # Implement fast SHA-512 with SSSE3 instructions. (x86_64) | ||
3 | # | ||
4 | # Copyright (C) 2013 Intel Corporation. | ||
5 | # | ||
6 | # Authors: | ||
7 | # James Guilford <james.guilford@intel.com> | ||
8 | # Kirk Yap <kirk.s.yap@intel.com> | ||
9 | # David Cote <david.m.cote@intel.com> | ||
10 | # Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | # | ||
12 | # This software is available to you under a choice of one of two | ||
13 | # licenses. You may choose to be licensed under the terms of the GNU | ||
14 | # General Public License (GPL) Version 2, available from the file | ||
15 | # COPYING in the main directory of this source tree, or the | ||
16 | # OpenIB.org BSD license below: | ||
17 | # | ||
18 | # Redistribution and use in source and binary forms, with or | ||
19 | # without modification, are permitted provided that the following | ||
20 | # conditions are met: | ||
21 | # | ||
22 | # - Redistributions of source code must retain the above | ||
23 | # copyright notice, this list of conditions and the following | ||
24 | # disclaimer. | ||
25 | # | ||
26 | # - Redistributions in binary form must reproduce the above | ||
27 | # copyright notice, this list of conditions and the following | ||
28 | # disclaimer in the documentation and/or other materials | ||
29 | # provided with the distribution. | ||
30 | # | ||
31 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
32 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
33 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
34 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
35 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
36 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
37 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
38 | # SOFTWARE. | ||
39 | # | ||
40 | ######################################################################## | ||
41 | # | ||
42 | # This code is described in an Intel White-Paper: | ||
43 | # "Fast SHA-512 Implementations on Intel Architecture Processors" | ||
44 | # | ||
45 | # To find it, surf to http://www.intel.com/p/en_US/embedded | ||
46 | # and search for that title. | ||
47 | # | ||
48 | ######################################################################## | ||
49 | |||
50 | #include <linux/linkage.h> | ||
51 | |||
52 | .text | ||
53 | |||
54 | # Virtual Registers | ||
55 | # ARG1 | ||
56 | msg = %rdi | ||
57 | # ARG2 | ||
58 | digest = %rsi | ||
59 | # ARG3 | ||
60 | msglen = %rdx | ||
61 | T1 = %rcx | ||
62 | T2 = %r8 | ||
63 | a_64 = %r9 | ||
64 | b_64 = %r10 | ||
65 | c_64 = %r11 | ||
66 | d_64 = %r12 | ||
67 | e_64 = %r13 | ||
68 | f_64 = %r14 | ||
69 | g_64 = %r15 | ||
70 | h_64 = %rbx | ||
71 | tmp0 = %rax | ||
72 | |||
73 | # Local variables (stack frame) | ||
74 | |||
75 | W_SIZE = 80*8 | ||
76 | WK_SIZE = 2*8 | ||
77 | RSPSAVE_SIZE = 1*8 | ||
78 | GPRSAVE_SIZE = 5*8 | ||
79 | |||
80 | frame_W = 0 | ||
81 | frame_WK = frame_W + W_SIZE | ||
82 | frame_RSPSAVE = frame_WK + WK_SIZE | ||
83 | frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE | ||
84 | frame_size = frame_GPRSAVE + GPRSAVE_SIZE | ||
85 | |||
86 | # Useful QWORD "arrays" for simpler memory references | ||
87 | # MSG, DIGEST, K_t, W_t are arrays | ||
88 | # WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even | ||
89 | |||
90 | # Input message (arg1) | ||
91 | #define MSG(i) 8*i(msg) | ||
92 | |||
93 | # Output Digest (arg2) | ||
94 | #define DIGEST(i) 8*i(digest) | ||
95 | |||
96 | # SHA Constants (static mem) | ||
97 | #define K_t(i) 8*i+K512(%rip) | ||
98 | |||
99 | # Message Schedule (stack frame) | ||
100 | #define W_t(i) 8*i+frame_W(%rsp) | ||
101 | |||
102 | # W[t]+K[t] (stack frame) | ||
103 | #define WK_2(i) 8*((i%2))+frame_WK(%rsp) | ||
104 | |||
105 | .macro RotateState | ||
106 | # Rotate symbols a..h right | ||
107 | TMP = h_64 | ||
108 | h_64 = g_64 | ||
109 | g_64 = f_64 | ||
110 | f_64 = e_64 | ||
111 | e_64 = d_64 | ||
112 | d_64 = c_64 | ||
113 | c_64 = b_64 | ||
114 | b_64 = a_64 | ||
115 | a_64 = TMP | ||
116 | .endm | ||
117 | |||
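# The rotate chains in the round macros below compute the SHA-512 big sigma
# functions on a single temporary register:
#   S1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41)
#         = ((((e ror 23) ^ e) ror 4) ^ e) ror 14    (23+4+14 = 41, 4+14 = 18)
#   S0(a) = (a ror 28) ^ (a ror 34) ^ (a ror 39)
#         = ((((a ror 5) ^ a) ror 6) ^ a) ror 28     (5+6+28 = 39, 6+28 = 34)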
118 | .macro SHA512_Round rnd | ||
119 | |||
120 | # Compute Round %%t | ||
121 | mov f_64, T1 # T1 = f | ||
122 | mov e_64, tmp0 # tmp = e | ||
123 | xor g_64, T1 # T1 = f ^ g | ||
124 | ror $23, tmp0 # 41 # tmp = e ror 23 | ||
125 | and e_64, T1 # T1 = (f ^ g) & e | ||
126 | xor e_64, tmp0 # tmp = (e ror 23) ^ e | ||
127 | xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) | ||
128 | idx = \rnd | ||
129 | add WK_2(idx), T1 # W[t] + K[t] from message scheduler | ||
130 | ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 | ||
131 | xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e | ||
132 | mov a_64, T2 # T2 = a | ||
133 | add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h | ||
134 | ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) | ||
135 | add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) | ||
136 | mov a_64, tmp0 # tmp = a | ||
137 | xor c_64, T2 # T2 = a ^ c | ||
138 | and c_64, tmp0 # tmp = a & c | ||
139 | and b_64, T2 # T2 = (a ^ c) & b | ||
140 | xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) | ||
141 | mov a_64, tmp0 # tmp = a | ||
142 | ror $5, tmp0 # 39 # tmp = a ror 5 | ||
143 | xor a_64, tmp0 # tmp = (a ror 5) ^ a | ||
144 | add T1, d_64 # e(next_state) = d + T1 | ||
145 | ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 | ||
146 | xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a | ||
147 | lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) | ||
148 | ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) | ||
149 | add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) | ||
150 | RotateState | ||
151 | .endm | ||
152 | |||
153 | .macro SHA512_2Sched_2Round_sse rnd | ||
154 | |||
155 | # Compute rounds t-2 and t-1 | ||
156 | # Compute message schedule QWORDS t and t+1 | ||
157 | |||
158 | # Two rounds are computed based on the values for K[t-2]+W[t-2] and | ||
159 | # K[t-1]+W[t-1] which were previously stored at WK_2 by the message | ||
160 | # scheduler. | ||
161 | # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. | ||
162 | # They are then added to their respective SHA512 constants at | ||
163 | # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] | ||
164 | # For brevity, the comments following vectored instructions only refer to | ||
165 | # the first of a pair of QWORDS. | ||
166 | # E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} | ||
167 | # The computation of the message schedule and the rounds are tightly | ||
168 | # stitched to take advantage of instruction-level parallelism. | ||
169 | # For clarity, integer instructions (for the rounds calculation) are indented | ||
170 | # by one tab. Vectored instructions (for the message scheduler) are indented | ||
171 | # by two tabs. | ||
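# The vector code also factors the small sigma functions into shared shifts:
#   s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
# has its right-shift terms built as ((((x >> 42) ^ x) >> 13) ^ x) >> 6
# (42+13+6 = 61, 13+6 = 19) and its left-shift terms as (x << 45) ^ (x << 3);
# s0(x) is produced the same way from shifts 1, 8 and 7.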
172 | |||
173 | mov f_64, T1 | ||
174 | idx = \rnd -2 | ||
175 | movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] | ||
176 | xor g_64, T1 | ||
177 | and e_64, T1 | ||
178 | movdqa %xmm2, %xmm0 # XMM0 = W[t-2] | ||
179 | xor g_64, T1 | ||
180 | idx = \rnd | ||
181 | add WK_2(idx), T1 | ||
182 | idx = \rnd - 15 | ||
183 | movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] | ||
184 | mov e_64, tmp0 | ||
185 | ror $23, tmp0 # 41 | ||
186 | movdqa %xmm5, %xmm3 # XMM3 = W[t-15] | ||
187 | xor e_64, tmp0 | ||
188 | ror $4, tmp0 # 18 | ||
189 | psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 | ||
190 | xor e_64, tmp0 | ||
191 | ror $14, tmp0 # 14 | ||
192 | psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 | ||
193 | add tmp0, T1 | ||
194 | add h_64, T1 | ||
195 | pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] | ||
196 | mov a_64, T2 | ||
197 | xor c_64, T2 | ||
198 | pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] | ||
199 | and b_64, T2 | ||
200 | mov a_64, tmp0 | ||
201 | psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 | ||
202 | and c_64, tmp0 | ||
203 | xor tmp0, T2 | ||
204 | psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 | ||
205 | mov a_64, tmp0 | ||
206 | ror $5, tmp0 # 39 | ||
207 | pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] | ||
208 | xor a_64, tmp0 | ||
209 | ror $6, tmp0 # 34 | ||
210 | pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] | ||
211 | xor a_64, tmp0 | ||
212 | ror $28, tmp0 # 28 | ||
213 | psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 | ||
214 | add tmp0, T2 | ||
215 | add T1, d_64 | ||
216 | psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 | ||
217 | lea (T1, T2), h_64 | ||
218 | RotateState | ||
219 | movdqa %xmm2, %xmm1 # XMM1 = W[t-2] | ||
220 | mov f_64, T1 | ||
221 | xor g_64, T1 | ||
222 | movdqa %xmm5, %xmm4 # XMM4 = W[t-15] | ||
223 | and e_64, T1 | ||
224 | xor g_64, T1 | ||
225 | psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 | ||
226 | idx = \rnd + 1 | ||
227 | add WK_2(idx), T1 | ||
228 | mov e_64, tmp0 | ||
229 | psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 | ||
230 | ror $23, tmp0 # 41 | ||
231 | xor e_64, tmp0 | ||
232 | pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] | ||
233 | ror $4, tmp0 # 18 | ||
234 | xor e_64, tmp0 | ||
235 | pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] | ||
236 | ror $14, tmp0 # 14 | ||
237 | add tmp0, T1 | ||
238 | psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 | ||
239 | add h_64, T1 | ||
240 | mov a_64, T2 | ||
241 | psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 | ||
242 | xor c_64, T2 | ||
243 | and b_64, T2 | ||
244 | pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) | ||
245 | mov a_64, tmp0 | ||
246 | and c_64, tmp0 | ||
247 | idx = \rnd - 7 | ||
248 | movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] | ||
249 | xor tmp0, T2 | ||
250 | pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) | ||
251 | mov a_64, tmp0 | ||
252 | paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) | ||
253 | ror $5, tmp0 # 39 | ||
254 | idx =\rnd-16 | ||
255 | paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] | ||
256 | xor a_64, tmp0 | ||
257 | paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] | ||
258 | ror $6, tmp0 # 34 | ||
259 | movdqa %xmm0, W_t(\rnd) # Store scheduled qwords | ||
260 | xor a_64, tmp0 | ||
261 | paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] | ||
262 | ror $28, tmp0 # 28 | ||
263 | idx = \rnd | ||
264 | movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds | ||
265 | add tmp0, T2 | ||
266 | add T1, d_64 | ||
267 | lea (T1, T2), h_64 | ||
268 | RotateState | ||
269 | .endm | ||
270 | |||
271 | ######################################################################## | ||
272 | # void sha512_transform_ssse3(const void* M, void* D, u64 L) | ||
273 | # Purpose: Updates the SHA512 digest stored at D with the message stored in M. | ||
274 | # The size of the message pointed to by M must be an integer multiple of SHA512 | ||
275 | # message blocks. | ||
276 | # L is the message length in SHA512 blocks. | ||
277 | ######################################################################## | ||
278 | ENTRY(sha512_transform_ssse3) | ||
279 | |||
280 | cmp $0, msglen | ||
281 | je nowork | ||
282 | |||
283 | # Allocate Stack Space | ||
284 | mov %rsp, %rax | ||
285 | sub $frame_size, %rsp | ||
286 | and $~(0x20 - 1), %rsp | ||
287 | mov %rax, frame_RSPSAVE(%rsp) | ||
288 | |||
289 | # Save GPRs | ||
290 | mov %rbx, frame_GPRSAVE(%rsp) | ||
291 | mov %r12, frame_GPRSAVE +8*1(%rsp) | ||
292 | mov %r13, frame_GPRSAVE +8*2(%rsp) | ||
293 | mov %r14, frame_GPRSAVE +8*3(%rsp) | ||
294 | mov %r15, frame_GPRSAVE +8*4(%rsp) | ||
295 | |||
296 | updateblock: | ||
297 | |||
298 | # Load state variables | ||
299 | mov DIGEST(0), a_64 | ||
300 | mov DIGEST(1), b_64 | ||
301 | mov DIGEST(2), c_64 | ||
302 | mov DIGEST(3), d_64 | ||
303 | mov DIGEST(4), e_64 | ||
304 | mov DIGEST(5), f_64 | ||
305 | mov DIGEST(6), g_64 | ||
306 | mov DIGEST(7), h_64 | ||
307 | |||
308 | t = 0 | ||
309 | .rept 80/2 + 1 | ||
310 | # (80 rounds) / (2 rounds/iteration) + (1 iteration) | ||
311 | # +1 iteration because the scheduler leads hashing by 1 iteration | ||
312 | .if t < 2 | ||
313 | # BSWAP 2 QWORDS | ||
314 | movdqa XMM_QWORD_BSWAP(%rip), %xmm1 | ||
315 | movdqu MSG(t), %xmm0 | ||
316 | pshufb %xmm1, %xmm0 # BSWAP | ||
317 | movdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
318 | paddq K_t(t), %xmm0 # Compute W[t]+K[t] | ||
319 | movdqa %xmm0, WK_2(t) # Store into WK for rounds | ||
320 | .elseif t < 16 | ||
321 | # BSWAP 2 QWORDS, compute 2 rounds | ||
322 | movdqu MSG(t), %xmm0 | ||
323 | pshufb %xmm1, %xmm0 # BSWAP | ||
324 | SHA512_Round t-2 # Round t-2 | ||
325 | movdqa %xmm0, W_t(t) # Store Scheduled Pair | ||
326 | paddq K_t(t), %xmm0 # Compute W[t]+K[t] | ||
327 | SHA512_Round t-1 # Round t-1 | ||
328 | movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK | ||
329 | .elseif t < 79 | ||
330 | # Schedule 2 QWORDS, compute 2 rounds | ||
331 | SHA512_2Sched_2Round_sse t | ||
332 | .else | ||
333 | # Compute 2 Rounds | ||
334 | SHA512_Round t-2 | ||
335 | SHA512_Round t-1 | ||
336 | .endif | ||
337 | t = t+2 | ||
338 | .endr | ||
339 | |||
340 | # Update digest | ||
341 | add a_64, DIGEST(0) | ||
342 | add b_64, DIGEST(1) | ||
343 | add c_64, DIGEST(2) | ||
344 | add d_64, DIGEST(3) | ||
345 | add e_64, DIGEST(4) | ||
346 | add f_64, DIGEST(5) | ||
347 | add g_64, DIGEST(6) | ||
348 | add h_64, DIGEST(7) | ||
349 | |||
350 | # Advance to next message block | ||
351 | add $16*8, msg | ||
352 | dec msglen | ||
353 | jnz updateblock | ||
354 | |||
355 | # Restore GPRs | ||
356 | mov frame_GPRSAVE(%rsp), %rbx | ||
357 | mov frame_GPRSAVE +8*1(%rsp), %r12 | ||
358 | mov frame_GPRSAVE +8*2(%rsp), %r13 | ||
359 | mov frame_GPRSAVE +8*3(%rsp), %r14 | ||
360 | mov frame_GPRSAVE +8*4(%rsp), %r15 | ||
361 | |||
362 | # Restore Stack Pointer | ||
363 | mov frame_RSPSAVE(%rsp), %rsp | ||
364 | |||
365 | nowork: | ||
366 | ret | ||
367 | ENDPROC(sha512_transform_ssse3) | ||
368 | |||
369 | ######################################################################## | ||
370 | ### Binary Data | ||
371 | |||
372 | .data | ||
373 | |||
374 | .align 16 | ||
375 | |||
376 | # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. | ||
377 | XMM_QWORD_BSWAP: | ||
378 | .octa 0x08090a0b0c0d0e0f0001020304050607 | ||
379 | |||
380 | # K[t] used in SHA512 hashing | ||
381 | K512: | ||
382 | .quad 0x428a2f98d728ae22,0x7137449123ef65cd | ||
383 | .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc | ||
384 | .quad 0x3956c25bf348b538,0x59f111f1b605d019 | ||
385 | .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 | ||
386 | .quad 0xd807aa98a3030242,0x12835b0145706fbe | ||
387 | .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 | ||
388 | .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 | ||
389 | .quad 0x9bdc06a725c71235,0xc19bf174cf692694 | ||
390 | .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 | ||
391 | .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 | ||
392 | .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 | ||
393 | .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 | ||
394 | .quad 0x983e5152ee66dfab,0xa831c66d2db43210 | ||
395 | .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 | ||
396 | .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 | ||
397 | .quad 0x06ca6351e003826f,0x142929670a0e6e70 | ||
398 | .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 | ||
399 | .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df | ||
400 | .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 | ||
401 | .quad 0x81c2c92e47edaee6,0x92722c851482353b | ||
402 | .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 | ||
403 | .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 | ||
404 | .quad 0xd192e819d6ef5218,0xd69906245565a910 | ||
405 | .quad 0xf40e35855771202a,0x106aa07032bbd1b8 | ||
406 | .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 | ||
407 | .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 | ||
408 | .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb | ||
409 | .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 | ||
410 | .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 | ||
411 | .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec | ||
412 | .quad 0x90befffa23631e28,0xa4506cebde82bde9 | ||
413 | .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b | ||
414 | .quad 0xca273eceea26619c,0xd186b8c721c0c207 | ||
415 | .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 | ||
416 | .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 | ||
417 | .quad 0x113f9804bef90dae,0x1b710b35131c471b | ||
418 | .quad 0x28db77f523047d84,0x32caab7b40c72493 | ||
419 | .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c | ||
420 | .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a | ||
421 | .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 | ||
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c new file mode 100644 index 000000000000..6cbd8df348d2 --- /dev/null +++ b/arch/x86/crypto/sha512_ssse3_glue.c | |||
@@ -0,0 +1,282 @@ | |||
1 | /* | ||
2 | * Cryptographic API. | ||
3 | * | ||
4 | * Glue code for the SHA512 Secure Hash Algorithm assembler | ||
5 | * implementation using supplemental SSE3 / AVX / AVX2 instructions. | ||
6 | * | ||
7 | * This file is based on sha512_generic.c | ||
8 | * | ||
9 | * Copyright (C) 2013 Intel Corporation | ||
10 | * Author: Tim Chen <tim.c.chen@linux.intel.com> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify it | ||
13 | * under the terms of the GNU General Public License as published by the Free | ||
14 | * Software Foundation; either version 2 of the License, or (at your option) | ||
15 | * any later version. | ||
16 | * | ||
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
19 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
21 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
22 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
23 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
24 | * SOFTWARE. | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
29 | |||
30 | #include <crypto/internal/hash.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/mm.h> | ||
34 | #include <linux/cryptohash.h> | ||
35 | #include <linux/types.h> | ||
36 | #include <crypto/sha.h> | ||
37 | #include <asm/byteorder.h> | ||
38 | #include <asm/i387.h> | ||
39 | #include <asm/xcr.h> | ||
40 | #include <asm/xsave.h> | ||
41 | |||
42 | #include <linux/string.h> | ||
43 | |||
44 | asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, | ||
45 | u64 rounds); | ||
46 | #ifdef CONFIG_AS_AVX | ||
47 | asmlinkage void sha512_transform_avx(const char *data, u64 *digest, | ||
48 | u64 rounds); | ||
49 | #endif | ||
50 | #ifdef CONFIG_AS_AVX2 | ||
51 | asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, | ||
52 | u64 rounds); | ||
53 | #endif | ||
54 | |||
55 | static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); | ||
56 | |||
57 | |||
58 | static int sha512_ssse3_init(struct shash_desc *desc) | ||
59 | { | ||
60 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
61 | |||
62 | sctx->state[0] = SHA512_H0; | ||
63 | sctx->state[1] = SHA512_H1; | ||
64 | sctx->state[2] = SHA512_H2; | ||
65 | sctx->state[3] = SHA512_H3; | ||
66 | sctx->state[4] = SHA512_H4; | ||
67 | sctx->state[5] = SHA512_H5; | ||
68 | sctx->state[6] = SHA512_H6; | ||
69 | sctx->state[7] = SHA512_H7; | ||
70 | sctx->count[0] = sctx->count[1] = 0; | ||
71 | |||
72 | return 0; | ||
73 | } | ||
74 | |||
75 | static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
76 | unsigned int len, unsigned int partial) | ||
77 | { | ||
78 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
79 | unsigned int done = 0; | ||
80 | |||
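/*
 * count[1]:count[0] form a 128-bit byte counter; the comparison below
 * detects wrap-around of the low 64 bits and carries into the high half.
 */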
81 | sctx->count[0] += len; | ||
82 | if (sctx->count[0] < len) | ||
83 | sctx->count[1]++; | ||
84 | |||
85 | if (partial) { | ||
86 | done = SHA512_BLOCK_SIZE - partial; | ||
87 | memcpy(sctx->buf + partial, data, done); | ||
88 | sha512_transform_asm(sctx->buf, sctx->state, 1); | ||
89 | } | ||
90 | |||
91 | if (len - done >= SHA512_BLOCK_SIZE) { | ||
92 | const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; | ||
93 | |||
94 | sha512_transform_asm(data + done, sctx->state, (u64) rounds); | ||
95 | |||
96 | done += rounds * SHA512_BLOCK_SIZE; | ||
97 | } | ||
98 | |||
99 | memcpy(sctx->buf, data + done, len - done); | ||
100 | |||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, | ||
105 | unsigned int len) | ||
106 | { | ||
107 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
108 | unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; | ||
109 | int res; | ||
110 | |||
111 | /* Handle the fast case right here */ | ||
112 | if (partial + len < SHA512_BLOCK_SIZE) { | ||
113 | sctx->count[0] += len; | ||
114 | if (sctx->count[0] < len) | ||
115 | sctx->count[1]++; | ||
116 | memcpy(sctx->buf + partial, data, len); | ||
117 | |||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | if (!irq_fpu_usable()) { | ||
122 | res = crypto_sha512_update(desc, data, len); | ||
123 | } else { | ||
124 | kernel_fpu_begin(); | ||
125 | res = __sha512_ssse3_update(desc, data, len, partial); | ||
126 | kernel_fpu_end(); | ||
127 | } | ||
128 | |||
129 | return res; | ||
130 | } | ||
131 | |||
132 | |||
133 | /* Add padding and return the message digest. */ | ||
134 | static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) | ||
135 | { | ||
136 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
137 | unsigned int i, index, padlen; | ||
138 | __be64 *dst = (__be64 *)out; | ||
139 | __be64 bits[2]; | ||
140 | static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, }; | ||
141 | |||
142 | /* save number of bits */ | ||
143 | bits[1] = cpu_to_be64(sctx->count[0] << 3); | ||
144 | bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61; | ||
145 | |||
146 | /* Pad out to 112 mod 128 and append length */ | ||
147 | index = sctx->count[0] & 0x7f; | ||
148 | padlen = (index < 112) ? (112 - index) : ((128+112) - index); | ||
149 | |||
150 | if (!irq_fpu_usable()) { | ||
151 | crypto_sha512_update(desc, padding, padlen); | ||
152 | crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
153 | } else { | ||
154 | kernel_fpu_begin(); | ||
155 | /* We need to fill a whole block for __sha512_ssse3_update() */ | ||
156 | if (padlen <= 112) { | ||
157 | sctx->count[0] += padlen; | ||
158 | if (sctx->count[0] < padlen) | ||
159 | sctx->count[1]++; | ||
160 | memcpy(sctx->buf + index, padding, padlen); | ||
161 | } else { | ||
162 | __sha512_ssse3_update(desc, padding, padlen, index); | ||
163 | } | ||
164 | __sha512_ssse3_update(desc, (const u8 *)&bits, | ||
165 | sizeof(bits), 112); | ||
166 | kernel_fpu_end(); | ||
167 | } | ||
168 | |||
169 | /* Store state in digest */ | ||
170 | for (i = 0; i < 8; i++) | ||
171 | dst[i] = cpu_to_be64(sctx->state[i]); | ||
172 | |||
173 | /* Wipe context */ | ||
174 | memset(sctx, 0, sizeof(*sctx)); | ||
175 | |||
176 | return 0; | ||
177 | } | ||
178 | |||
179 | static int sha512_ssse3_export(struct shash_desc *desc, void *out) | ||
180 | { | ||
181 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
182 | |||
183 | memcpy(out, sctx, sizeof(*sctx)); | ||
184 | |||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static int sha512_ssse3_import(struct shash_desc *desc, const void *in) | ||
189 | { | ||
190 | struct sha512_state *sctx = shash_desc_ctx(desc); | ||
191 | |||
192 | memcpy(sctx, in, sizeof(*sctx)); | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static struct shash_alg alg = { | ||
198 | .digestsize = SHA512_DIGEST_SIZE, | ||
199 | .init = sha512_ssse3_init, | ||
200 | .update = sha512_ssse3_update, | ||
201 | .final = sha512_ssse3_final, | ||
202 | .export = sha512_ssse3_export, | ||
203 | .import = sha512_ssse3_import, | ||
204 | .descsize = sizeof(struct sha512_state), | ||
205 | .statesize = sizeof(struct sha512_state), | ||
206 | .base = { | ||
207 | .cra_name = "sha512", | ||
208 | .cra_driver_name = "sha512-ssse3", | ||
209 | .cra_priority = 150, | ||
210 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
211 | .cra_blocksize = SHA512_BLOCK_SIZE, | ||
212 | .cra_module = THIS_MODULE, | ||
213 | } | ||
214 | }; | ||
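/*
 * Usage sketch (not part of this patch): once registered, the driver is
 * selected through the regular crypto API by algorithm name, e.g. from a
 * hypothetical caller (error handling omitted):
 *
 *	struct crypto_shash *tfm = crypto_alloc_shash("sha512", 0, 0);
 *	struct shash_desc *desc = kmalloc(sizeof(*desc) +
 *					  crypto_shash_descsize(tfm), GFP_KERNEL);
 *	u8 out[SHA512_DIGEST_SIZE];
 *
 *	desc->tfm = tfm;
 *	desc->flags = 0;
 *	crypto_shash_digest(desc, data, len, out);
 *	kfree(desc);
 *	crypto_free_shash(tfm);
 */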
215 | |||
216 | #ifdef CONFIG_AS_AVX | ||
217 | static bool __init avx_usable(void) | ||
218 | { | ||
219 | u64 xcr0; | ||
220 | |||
221 | if (!cpu_has_avx || !cpu_has_osxsave) | ||
222 | return false; | ||
223 | |||
224 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
225 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
226 | pr_info("AVX detected but unusable.\n"); | ||
227 | |||
228 | return false; | ||
229 | } | ||
230 | |||
231 | return true; | ||
232 | } | ||
233 | #endif | ||
234 | |||
235 | static int __init sha512_ssse3_mod_init(void) | ||
236 | { | ||
237 | /* test for SSSE3 first */ | ||
238 | if (cpu_has_ssse3) | ||
239 | sha512_transform_asm = sha512_transform_ssse3; | ||
240 | |||
241 | #ifdef CONFIG_AS_AVX | ||
242 | /* allow AVX to override SSSE3, as it's a little faster */ | ||
243 | if (avx_usable()) { | ||
244 | #ifdef CONFIG_AS_AVX2 | ||
245 | if (boot_cpu_has(X86_FEATURE_AVX2)) | ||
246 | sha512_transform_asm = sha512_transform_rorx; | ||
247 | else | ||
248 | #endif | ||
249 | sha512_transform_asm = sha512_transform_avx; | ||
250 | } | ||
251 | #endif | ||
252 | |||
253 | if (sha512_transform_asm) { | ||
254 | #ifdef CONFIG_AS_AVX | ||
255 | if (sha512_transform_asm == sha512_transform_avx) | ||
256 | pr_info("Using AVX optimized SHA-512 implementation\n"); | ||
257 | #ifdef CONFIG_AS_AVX2 | ||
258 | else if (sha512_transform_asm == sha512_transform_rorx) | ||
259 | pr_info("Using AVX2 optimized SHA-512 implementation\n"); | ||
260 | #endif | ||
261 | else | ||
262 | #endif | ||
263 | pr_info("Using SSSE3 optimized SHA-512 implementation\n"); | ||
264 | return crypto_register_shash(&alg); | ||
265 | } | ||
266 | pr_info("Neither AVX nor SSSE3 is available/usable.\n"); | ||
267 | |||
268 | return -ENODEV; | ||
269 | } | ||
270 | |||
271 | static void __exit sha512_ssse3_mod_fini(void) | ||
272 | { | ||
273 | crypto_unregister_shash(&alg); | ||
274 | } | ||
275 | |||
276 | module_init(sha512_ssse3_mod_init); | ||
277 | module_exit(sha512_ssse3_mod_fini); | ||
278 | |||
279 | MODULE_LICENSE("GPL"); | ||
280 | MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); | ||
281 | |||
282 | MODULE_ALIAS("sha512"); | ||
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 8d3e113b2c95..05058134c443 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 7 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
@@ -33,6 +33,8 @@ | |||
33 | 33 | ||
34 | .Lbswap128_mask: | 34 | .Lbswap128_mask: |
35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
36 | .Lxts_gf128mul_and_shl1_mask: | ||
37 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
36 | 38 | ||
37 | .text | 39 | .text |
38 | 40 | ||
@@ -408,3 +410,47 @@ ENTRY(twofish_ctr_8way) | |||
408 | 410 | ||
409 | ret; | 411 | ret; |
410 | ENDPROC(twofish_ctr_8way) | 412 | ENDPROC(twofish_ctr_8way) |
413 | |||
414 | ENTRY(twofish_xts_enc_8way) | ||
415 | /* input: | ||
416 | * %rdi: ctx, CTX | ||
417 | * %rsi: dst | ||
418 | * %rdx: src | ||
419 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
420 | */ | ||
421 | |||
422 | movq %rsi, %r11; | ||
423 | |||
424 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
425 | load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, | ||
426 | RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); | ||
427 | |||
428 | call __twofish_enc_blk8; | ||
429 | |||
430 | /* dst <= regs xor IVs(in dst) */ | ||
431 | store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); | ||
432 | |||
433 | ret; | ||
434 | ENDPROC(twofish_xts_enc_8way) | ||
435 | |||
436 | ENTRY(twofish_xts_dec_8way) | ||
437 | /* input: | ||
438 | * %rdi: ctx, CTX | ||
439 | * %rsi: dst | ||
440 | * %rdx: src | ||
441 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
442 | */ | ||
443 | |||
444 | movq %rsi, %r11; | ||
445 | |||
446 | /* regs <= src, dst <= IVs, regs <= regs xor IVs */ | ||
447 | load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, | ||
448 | RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); | ||
449 | |||
450 | call __twofish_dec_blk8; | ||
451 | |||
452 | /* dst <= regs xor IVs(in dst) */ | ||
453 | store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
454 | |||
455 | ret; | ||
456 | ENDPROC(twofish_xts_dec_8way) | ||
diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S new file mode 100644 index 000000000000..e1a83b9cd389 --- /dev/null +++ b/arch/x86/crypto/twofish-avx2-asm_64.S | |||
@@ -0,0 +1,600 @@ | |||
1 | /* | ||
2 | * x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | #include "glue_helper-asm-avx2.S" | ||
15 | |||
16 | .file "twofish-avx2-asm_64.S" | ||
17 | |||
18 | .data | ||
19 | .align 16 | ||
20 | |||
21 | .Lvpshufb_mask0: | ||
22 | .long 0x80808000 | ||
23 | .long 0x80808004 | ||
24 | .long 0x80808008 | ||
25 | .long 0x8080800c | ||
26 | |||
27 | .Lbswap128_mask: | ||
28 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
29 | .Lxts_gf128mul_and_shl1_mask_0: | ||
30 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
31 | .Lxts_gf128mul_and_shl1_mask_1: | ||
32 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
33 | |||
34 | .text | ||
35 | |||
36 | /* structure of crypto context */ | ||
37 | #define s0 0 | ||
38 | #define s1 1024 | ||
39 | #define s2 2048 | ||
40 | #define s3 3072 | ||
41 | #define w 4096 | ||
42 | #define k 4128 | ||
43 | |||
44 | /* register macros */ | ||
45 | #define CTX %rdi | ||
46 | |||
47 | #define RS0 CTX | ||
48 | #define RS1 %r8 | ||
49 | #define RS2 %r9 | ||
50 | #define RS3 %r10 | ||
51 | #define RK %r11 | ||
52 | #define RW %rax | ||
53 | #define RROUND %r12 | ||
54 | #define RROUNDd %r12d | ||
55 | |||
56 | #define RA0 %ymm8 | ||
57 | #define RB0 %ymm9 | ||
58 | #define RC0 %ymm10 | ||
59 | #define RD0 %ymm11 | ||
60 | #define RA1 %ymm12 | ||
61 | #define RB1 %ymm13 | ||
62 | #define RC1 %ymm14 | ||
63 | #define RD1 %ymm15 | ||
64 | |||
65 | /* temp regs */ | ||
66 | #define RX0 %ymm0 | ||
67 | #define RY0 %ymm1 | ||
68 | #define RX1 %ymm2 | ||
69 | #define RY1 %ymm3 | ||
70 | #define RT0 %ymm4 | ||
71 | #define RIDX %ymm5 | ||
72 | |||
73 | #define RX0x %xmm0 | ||
74 | #define RY0x %xmm1 | ||
75 | #define RX1x %xmm2 | ||
76 | #define RY1x %xmm3 | ||
77 | #define RT0x %xmm4 | ||
78 | |||
79 | /* vpgatherdd mask and '-1' */ | ||
80 | #define RNOT %ymm6 | ||
81 | |||
82 | /* byte mask, (-1 >> 24) */ | ||
83 | #define RBYTE %ymm7 | ||
84 | |||
85 | /********************************************************************** | ||
86 | 16-way AVX2 twofish | ||
87 | **********************************************************************/ | ||
88 | #define init_round_constants() \ | ||
89 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
90 | vpsrld $24, RNOT, RBYTE; \ | ||
91 | leaq k(CTX), RK; \ | ||
92 | leaq w(CTX), RW; \ | ||
93 | leaq s1(CTX), RS1; \ | ||
94 | leaq s2(CTX), RS2; \ | ||
95 | leaq s3(CTX), RS3; \ | ||
96 | |||
97 | #define g16(ab, rs0, rs1, rs2, rs3, xy) \ | ||
98 | vpand RBYTE, ab ## 0, RIDX; \ | ||
99 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ | ||
100 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
101 | \ | ||
102 | vpand RBYTE, ab ## 1, RIDX; \ | ||
103 | vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ | ||
104 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
105 | \ | ||
106 | vpsrld $8, ab ## 0, RIDX; \ | ||
107 | vpand RBYTE, RIDX, RIDX; \ | ||
108 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
109 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
110 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
111 | \ | ||
112 | vpsrld $8, ab ## 1, RIDX; \ | ||
113 | vpand RBYTE, RIDX, RIDX; \ | ||
114 | vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ | ||
115 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
116 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
117 | \ | ||
118 | vpsrld $16, ab ## 0, RIDX; \ | ||
119 | vpand RBYTE, RIDX, RIDX; \ | ||
120 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
121 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
122 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
123 | \ | ||
124 | vpsrld $16, ab ## 1, RIDX; \ | ||
125 | vpand RBYTE, RIDX, RIDX; \ | ||
126 | vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ | ||
127 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
128 | vpxor RT0, xy ## 1, xy ## 1; \ | ||
129 | \ | ||
130 | vpsrld $24, ab ## 0, RIDX; \ | ||
131 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
132 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
133 | vpxor RT0, xy ## 0, xy ## 0; \ | ||
134 | \ | ||
135 | vpsrld $24, ab ## 1, RIDX; \ | ||
136 | vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ | ||
137 | vpcmpeqd RNOT, RNOT, RNOT; \ | ||
138 | vpxor RT0, xy ## 1, xy ## 1; | ||
139 | |||
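/*
 * Note on g16() above: vpgatherdd clears its mask register as it retires,
 * so RNOT must be refilled with all ones (vpcmpeqd RNOT, RNOT, RNOT) before
 * every gather.  Each step gathers one s-box byte lane for all 16 blocks.
 */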
140 | #define g1_16(a, x) \ | ||
141 | g16(a, RS0, RS1, RS2, RS3, x); | ||
142 | |||
143 | #define g2_16(b, y) \ | ||
144 | g16(b, RS1, RS2, RS3, RS0, y); | ||
145 | |||
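/*
 * encrypt_round_end16() finishes a round for all 16 blocks: the two vpaddd
 * implement Twofish's pseudo-Hadamard transform (x += y; y += x), the
 * vpbroadcastd pair adds the round subkeys, then d ^= y and c = (c ^ x) ror 1.
 */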
146 | #define encrypt_round_end16(a, b, c, d, nk) \ | ||
147 | vpaddd RY0, RX0, RX0; \ | ||
148 | vpaddd RX0, RY0, RY0; \ | ||
149 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
150 | vpaddd RT0, RX0, RX0; \ | ||
151 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
152 | vpaddd RT0, RY0, RY0; \ | ||
153 | \ | ||
154 | vpxor RY0, d ## 0, d ## 0; \ | ||
155 | \ | ||
156 | vpxor RX0, c ## 0, c ## 0; \ | ||
157 | vpsrld $1, c ## 0, RT0; \ | ||
158 | vpslld $31, c ## 0, c ## 0; \ | ||
159 | vpor RT0, c ## 0, c ## 0; \ | ||
160 | \ | ||
161 | vpaddd RY1, RX1, RX1; \ | ||
162 | vpaddd RX1, RY1, RY1; \ | ||
163 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
164 | vpaddd RT0, RX1, RX1; \ | ||
165 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
166 | vpaddd RT0, RY1, RY1; \ | ||
167 | \ | ||
168 | vpxor RY1, d ## 1, d ## 1; \ | ||
169 | \ | ||
170 | vpxor RX1, c ## 1, c ## 1; \ | ||
171 | vpsrld $1, c ## 1, RT0; \ | ||
172 | vpslld $31, c ## 1, c ## 1; \ | ||
173 | vpor RT0, c ## 1, c ## 1; \ | ||
174 | |||
175 | #define encrypt_round16(a, b, c, d, nk) \ | ||
176 | g2_16(b, RY); \ | ||
177 | \ | ||
178 | vpslld $1, b ## 0, RT0; \ | ||
179 | vpsrld $31, b ## 0, b ## 0; \ | ||
180 | vpor RT0, b ## 0, b ## 0; \ | ||
181 | \ | ||
182 | vpslld $1, b ## 1, RT0; \ | ||
183 | vpsrld $31, b ## 1, b ## 1; \ | ||
184 | vpor RT0, b ## 1, b ## 1; \ | ||
185 | \ | ||
186 | g1_16(a, RX); \ | ||
187 | \ | ||
188 | encrypt_round_end16(a, b, c, d, nk); | ||
189 | |||
190 | #define encrypt_round_first16(a, b, c, d, nk) \ | ||
191 | vpslld $1, d ## 0, RT0; \ | ||
192 | vpsrld $31, d ## 0, d ## 0; \ | ||
193 | vpor RT0, d ## 0, d ## 0; \ | ||
194 | \ | ||
195 | vpslld $1, d ## 1, RT0; \ | ||
196 | vpsrld $31, d ## 1, d ## 1; \ | ||
197 | vpor RT0, d ## 1, d ## 1; \ | ||
198 | \ | ||
199 | encrypt_round16(a, b, c, d, nk); | ||
200 | |||
201 | #define encrypt_round_last16(a, b, c, d, nk) \ | ||
202 | g2_16(b, RY); \ | ||
203 | \ | ||
204 | g1_16(a, RX); \ | ||
205 | \ | ||
206 | encrypt_round_end16(a, b, c, d, nk); | ||
207 | |||
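[Editor's note] The *_round_end16 macros implement the tail of the Twofish F function. The first two vpaddd instructions are the pseudo-Hadamard transform, X' = X + Y and Y' = X + 2Y (mod 2^32); the vpbroadcastd/vpaddd pairs then add the round's two 32-bit subkeys taken from k[] at nk(RK,RROUND,8) and 4+nk(RK,RROUND,8); finally one target word is XORed and rotated right by one bit (vpsrld $1 / vpslld $31 / vpor). The one-bit left rotation that the specification applies to the other target word before its XOR is hoisted out of this macro: encrypt_round_first16 performs it for the first round, and the b-word rotation inside encrypt_round16 performs it one round early for every later round, which is why the last-round variant omits it.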
208 | #define decrypt_round_end16(a, b, c, d, nk) \ | ||
209 | vpaddd RY0, RX0, RX0; \ | ||
210 | vpaddd RX0, RY0, RY0; \ | ||
211 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
212 | vpaddd RT0, RX0, RX0; \ | ||
213 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
214 | vpaddd RT0, RY0, RY0; \ | ||
215 | \ | ||
216 | vpxor RX0, c ## 0, c ## 0; \ | ||
217 | \ | ||
218 | vpxor RY0, d ## 0, d ## 0; \ | ||
219 | vpsrld $1, d ## 0, RT0; \ | ||
220 | vpslld $31, d ## 0, d ## 0; \ | ||
221 | vpor RT0, d ## 0, d ## 0; \ | ||
222 | \ | ||
223 | vpaddd RY1, RX1, RX1; \ | ||
224 | vpaddd RX1, RY1, RY1; \ | ||
225 | vpbroadcastd nk(RK,RROUND,8), RT0; \ | ||
226 | vpaddd RT0, RX1, RX1; \ | ||
227 | vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ | ||
228 | vpaddd RT0, RY1, RY1; \ | ||
229 | \ | ||
230 | vpxor RX1, c ## 1, c ## 1; \ | ||
231 | \ | ||
232 | vpxor RY1, d ## 1, d ## 1; \ | ||
233 | vpsrld $1, d ## 1, RT0; \ | ||
234 | vpslld $31, d ## 1, d ## 1; \ | ||
235 | vpor RT0, d ## 1, d ## 1; | ||
236 | |||
237 | #define decrypt_round16(a, b, c, d, nk) \ | ||
238 | g1_16(a, RX); \ | ||
239 | \ | ||
240 | vpslld $1, a ## 0, RT0; \ | ||
241 | vpsrld $31, a ## 0, a ## 0; \ | ||
242 | vpor RT0, a ## 0, a ## 0; \ | ||
243 | \ | ||
244 | vpslld $1, a ## 1, RT0; \ | ||
245 | vpsrld $31, a ## 1, a ## 1; \ | ||
246 | vpor RT0, a ## 1, a ## 1; \ | ||
247 | \ | ||
248 | g2_16(b, RY); \ | ||
249 | \ | ||
250 | decrypt_round_end16(a, b, c, d, nk); | ||
251 | |||
252 | #define decrypt_round_first16(a, b, c, d, nk) \ | ||
253 | vpslld $1, c ## 0, RT0; \ | ||
254 | vpsrld $31, c ## 0, c ## 0; \ | ||
255 | vpor RT0, c ## 0, c ## 0; \ | ||
256 | \ | ||
257 | vpslld $1, c ## 1, RT0; \ | ||
258 | vpsrld $31, c ## 1, c ## 1; \ | ||
259 | vpor RT0, c ## 1, c ## 1; \ | ||
260 | \ | ||
261 | decrypt_round16(a, b, c, d, nk); | ||
262 | |||
263 | #define decrypt_round_last16(a, b, c, d, nk) \ | ||
264 | g1_16(a, RX); \ | ||
265 | \ | ||
266 | g2_16(b, RY); \ | ||
267 | \ | ||
268 | decrypt_round_end16(a, b, c, d, nk); | ||
269 | |||
270 | #define encrypt_cycle16() \ | ||
271 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
272 | encrypt_round16(RC, RD, RA, RB, 8); | ||
273 | |||
274 | #define encrypt_cycle_first16() \ | ||
275 | encrypt_round_first16(RA, RB, RC, RD, 0); \ | ||
276 | encrypt_round16(RC, RD, RA, RB, 8); | ||
277 | |||
278 | #define encrypt_cycle_last16() \ | ||
279 | encrypt_round16(RA, RB, RC, RD, 0); \ | ||
280 | encrypt_round_last16(RC, RD, RA, RB, 8); | ||
281 | |||
282 | #define decrypt_cycle16() \ | ||
283 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
284 | decrypt_round16(RA, RB, RC, RD, 0); | ||
285 | |||
286 | #define decrypt_cycle_first16() \ | ||
287 | decrypt_round_first16(RC, RD, RA, RB, 8); \ | ||
288 | decrypt_round16(RA, RB, RC, RD, 0); | ||
289 | |||
290 | #define decrypt_cycle_last16() \ | ||
291 | decrypt_round16(RC, RD, RA, RB, 8); \ | ||
292 | decrypt_round_last16(RA, RB, RC, RD, 0); | ||
293 | |||
294 | #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ | ||
295 | vpunpckhdq x1, x0, t2; \ | ||
296 | vpunpckldq x1, x0, x0; \ | ||
297 | \ | ||
298 | vpunpckldq x3, x2, t1; \ | ||
299 | vpunpckhdq x3, x2, x2; \ | ||
300 | \ | ||
301 | vpunpckhqdq t1, x0, x1; \ | ||
302 | vpunpcklqdq t1, x0, x0; \ | ||
303 | \ | ||
304 | vpunpckhqdq x2, t2, x3; \ | ||
305 | vpunpcklqdq x2, t2, x2; | ||
306 | |||
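[Editor's note] transpose_4x4 is a 4x4 transpose of 32-bit words carried out independently in each 128-bit lane of the ymm registers; since a transpose is its own inverse, read_blocks8 and write_blocks8 can share it. It converts between the loaded block layout and the word-sliced form the rounds operate on, where each register holds the same state word (a, b, c or d) of eight different blocks, so a single gather or arithmetic instruction advances eight blocks, and the paired 0/1 registers sixteen, per step.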
307 | #define read_blocks8(offs,a,b,c,d) \ | ||
308 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
309 | |||
310 | #define write_blocks8(offs,a,b,c,d) \ | ||
311 | transpose_4x4(a, b, c, d, RX0, RY0); | ||
312 | |||
313 | #define inpack_enc8(a,b,c,d) \ | ||
314 | vpbroadcastd 4*0(RW), RT0; \ | ||
315 | vpxor RT0, a, a; \ | ||
316 | \ | ||
317 | vpbroadcastd 4*1(RW), RT0; \ | ||
318 | vpxor RT0, b, b; \ | ||
319 | \ | ||
320 | vpbroadcastd 4*2(RW), RT0; \ | ||
321 | vpxor RT0, c, c; \ | ||
322 | \ | ||
323 | vpbroadcastd 4*3(RW), RT0; \ | ||
324 | vpxor RT0, d, d; | ||
325 | |||
326 | #define outunpack_enc8(a,b,c,d) \ | ||
327 | vpbroadcastd 4*4(RW), RX0; \ | ||
328 | vpbroadcastd 4*5(RW), RY0; \ | ||
329 | vpxor RX0, c, RX0; \ | ||
330 | vpxor RY0, d, RY0; \ | ||
331 | \ | ||
332 | vpbroadcastd 4*6(RW), RT0; \ | ||
333 | vpxor RT0, a, c; \ | ||
334 | vpbroadcastd 4*7(RW), RT0; \ | ||
335 | vpxor RT0, b, d; \ | ||
336 | \ | ||
337 | vmovdqa RX0, a; \ | ||
338 | vmovdqa RY0, b; | ||
339 | |||
340 | #define inpack_dec8(a,b,c,d) \ | ||
341 | vpbroadcastd 4*4(RW), RX0; \ | ||
342 | vpbroadcastd 4*5(RW), RY0; \ | ||
343 | vpxor RX0, a, RX0; \ | ||
344 | vpxor RY0, b, RY0; \ | ||
345 | \ | ||
346 | vpbroadcastd 4*6(RW), RT0; \ | ||
347 | vpxor RT0, c, a; \ | ||
348 | vpbroadcastd 4*7(RW), RT0; \ | ||
349 | vpxor RT0, d, b; \ | ||
350 | \ | ||
351 | vmovdqa RX0, c; \ | ||
352 | vmovdqa RY0, d; | ||
353 | |||
354 | #define outunpack_dec8(a,b,c,d) \ | ||
355 | vpbroadcastd 4*0(RW), RT0; \ | ||
356 | vpxor RT0, a, a; \ | ||
357 | \ | ||
358 | vpbroadcastd 4*1(RW), RT0; \ | ||
359 | vpxor RT0, b, b; \ | ||
360 | \ | ||
361 | vpbroadcastd 4*2(RW), RT0; \ | ||
362 | vpxor RT0, c, c; \ | ||
363 | \ | ||
364 | vpbroadcastd 4*3(RW), RT0; \ | ||
365 | vpxor RT0, d, d; | ||
366 | |||
367 | #define read_blocks16(a,b,c,d) \ | ||
368 | read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
369 | read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
370 | |||
371 | #define write_blocks16(a,b,c,d) \ | ||
372 | write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
373 | write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
374 | |||
375 | #define xor_blocks16(a,b,c,d) \ | ||
376 | xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
377 | xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); | ||
378 | |||
379 | #define inpack_enc16(a,b,c,d) \ | ||
380 | inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
381 | inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
382 | |||
383 | #define outunpack_enc16(a,b,c,d) \ | ||
384 | outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
385 | outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
386 | |||
387 | #define inpack_dec16(a,b,c,d) \ | ||
388 | inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
389 | inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
390 | |||
391 | #define outunpack_dec16(a,b,c,d) \ | ||
392 | outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ | ||
393 | outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); | ||
394 | |||
395 | .align 8 | ||
396 | __twofish_enc_blk16: | ||
397 | /* input: | ||
398 | * %rdi: ctx, CTX | ||
399 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
400 | * output: | ||
401 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
402 | */ | ||
403 | init_round_constants(); | ||
404 | |||
405 | read_blocks16(RA, RB, RC, RD); | ||
406 | inpack_enc16(RA, RB, RC, RD); | ||
407 | |||
408 | xorl RROUNDd, RROUNDd; | ||
409 | encrypt_cycle_first16(); | ||
410 | movl $2, RROUNDd; | ||
411 | |||
412 | .align 4 | ||
413 | .L__enc_loop: | ||
414 | encrypt_cycle16(); | ||
415 | |||
416 | addl $2, RROUNDd; | ||
417 | cmpl $14, RROUNDd; | ||
418 | jne .L__enc_loop; | ||
419 | |||
420 | encrypt_cycle_last16(); | ||
421 | |||
422 | outunpack_enc16(RA, RB, RC, RD); | ||
423 | write_blocks16(RA, RB, RC, RD); | ||
424 | |||
425 | ret; | ||
426 | ENDPROC(__twofish_enc_blk16) | ||
427 | |||
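[Editor's note] Round bookkeeping: each encrypt_cycle16 performs two of Twofish's sixteen rounds, and each round consumes a pair of 32-bit subkeys, so one cycle advances 16 bytes into k[]; RROUND therefore steps by 2 per cycle, with scale 8 in the addressing above. One explicit first cycle (RROUND = 0), six loop iterations (RROUND = 2, 4, ..., 12) and one explicit last cycle (RROUND = 14) give (1 + 6 + 1) x 2 = 16 rounds. The first and last cycles are split out because the boundary one-bit rotations differ there. The decryption routine below walks the same schedule in reverse, starting RROUND at 14 and counting down to 0.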
428 | .align 8 | ||
429 | __twofish_dec_blk16: | ||
430 | /* input: | ||
431 | * %rdi: ctx, CTX | ||
432 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext | ||
433 | * output: | ||
434 | * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext | ||
435 | */ | ||
436 | init_round_constants(); | ||
437 | |||
438 | read_blocks16(RA, RB, RC, RD); | ||
439 | inpack_dec16(RA, RB, RC, RD); | ||
440 | |||
441 | movl $14, RROUNDd; | ||
442 | decrypt_cycle_first16(); | ||
443 | movl $12, RROUNDd; | ||
444 | |||
445 | .align 4 | ||
446 | .L__dec_loop: | ||
447 | decrypt_cycle16(); | ||
448 | |||
449 | addl $-2, RROUNDd; | ||
450 | jnz .L__dec_loop; | ||
451 | |||
452 | decrypt_cycle_last16(); | ||
453 | |||
454 | outunpack_dec16(RA, RB, RC, RD); | ||
455 | write_blocks16(RA, RB, RC, RD); | ||
456 | |||
457 | ret; | ||
458 | ENDPROC(__twofish_dec_blk16) | ||
459 | |||
460 | ENTRY(twofish_ecb_enc_16way) | ||
461 | /* input: | ||
462 | * %rdi: ctx, CTX | ||
463 | * %rsi: dst | ||
464 | * %rdx: src | ||
465 | */ | ||
466 | |||
467 | vzeroupper; | ||
468 | pushq %r12; | ||
469 | |||
470 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
471 | |||
472 | call __twofish_enc_blk16; | ||
473 | |||
474 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
475 | |||
476 | popq %r12; | ||
477 | vzeroupper; | ||
478 | |||
479 | ret; | ||
480 | ENDPROC(twofish_ecb_enc_16way) | ||
481 | |||
482 | ENTRY(twofish_ecb_dec_16way) | ||
483 | /* input: | ||
484 | * %rdi: ctx, CTX | ||
485 | * %rsi: dst | ||
486 | * %rdx: src | ||
487 | */ | ||
488 | |||
489 | vzeroupper; | ||
490 | pushq %r12; | ||
491 | |||
492 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
493 | |||
494 | call __twofish_dec_blk16; | ||
495 | |||
496 | store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
497 | |||
498 | popq %r12; | ||
499 | vzeroupper; | ||
500 | |||
501 | ret; | ||
502 | ENDPROC(twofish_ecb_dec_16way) | ||
503 | |||
504 | ENTRY(twofish_cbc_dec_16way) | ||
505 | /* input: | ||
506 | * %rdi: ctx, CTX | ||
507 | * %rsi: dst | ||
508 | * %rdx: src | ||
509 | */ | ||
510 | |||
511 | vzeroupper; | ||
512 | pushq %r12; | ||
513 | |||
514 | load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
515 | |||
516 | call __twofish_dec_blk16; | ||
517 | |||
518 | store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, | ||
519 | RX0); | ||
520 | |||
521 | popq %r12; | ||
522 | vzeroupper; | ||
523 | |||
524 | ret; | ||
525 | ENDPROC(twofish_cbc_dec_16way) | ||
526 | |||
527 | ENTRY(twofish_ctr_16way) | ||
528 | /* input: | ||
529 | * %rdi: ctx, CTX | ||
530 | * %rsi: dst (16 blocks) | ||
531 | * %rdx: src (16 blocks) | ||
532 | * %rcx: iv (little endian, 128bit) | ||
533 | */ | ||
534 | |||
535 | vzeroupper; | ||
536 | pushq %r12; | ||
537 | |||
538 | load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
539 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
540 | RBYTE); | ||
541 | |||
542 | call __twofish_enc_blk16; | ||
543 | |||
544 | store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
545 | |||
546 | popq %r12; | ||
547 | vzeroupper; | ||
548 | |||
549 | ret; | ||
550 | ENDPROC(twofish_ctr_16way) | ||
551 | |||
552 | .align 8 | ||
553 | twofish_xts_crypt_16way: | ||
554 | /* input: | ||
555 | * %rdi: ctx, CTX | ||
556 | * %rsi: dst (16 blocks) | ||
557 | * %rdx: src (16 blocks) | ||
558 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
559 | * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 | ||
560 | */ | ||
561 | |||
562 | vzeroupper; | ||
563 | pushq %r12; | ||
564 | |||
565 | load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, | ||
566 | RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, | ||
567 | .Lxts_gf128mul_and_shl1_mask_0, | ||
568 | .Lxts_gf128mul_and_shl1_mask_1); | ||
569 | |||
570 | call *%r8; | ||
571 | |||
572 | store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); | ||
573 | |||
574 | popq %r12; | ||
575 | vzeroupper; | ||
576 | |||
577 | ret; | ||
578 | ENDPROC(twofish_xts_crypt_16way) | ||
579 | |||
580 | ENTRY(twofish_xts_enc_16way) | ||
581 | /* input: | ||
582 | * %rdi: ctx, CTX | ||
583 | * %rsi: dst (16 blocks) | ||
584 | * %rdx: src (16 blocks) | ||
585 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
586 | */ | ||
587 | leaq __twofish_enc_blk16, %r8; | ||
588 | jmp twofish_xts_crypt_16way; | ||
589 | ENDPROC(twofish_xts_enc_16way) | ||
590 | |||
591 | ENTRY(twofish_xts_dec_16way) | ||
592 | /* input: | ||
593 | * %rdi: ctx, CTX | ||
594 | * %rsi: dst (16 blocks) | ||
595 | * %rdx: src (16 blocks) | ||
596 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
597 | */ | ||
598 | leaq __twofish_dec_blk16, %r8; | ||
599 | jmp twofish_xts_crypt_16way; | ||
600 | ENDPROC(twofish_xts_dec_16way) | ||
diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c new file mode 100644 index 000000000000..ce33b5be64ee --- /dev/null +++ b/arch/x86/crypto/twofish_avx2_glue.c | |||
@@ -0,0 +1,584 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX2 assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/twofish.h> | ||
20 | #include <crypto/lrw.h> | ||
21 | #include <crypto/xts.h> | ||
22 | #include <asm/xcr.h> | ||
23 | #include <asm/xsave.h> | ||
24 | #include <asm/crypto/twofish.h> | ||
25 | #include <asm/crypto/ablk_helper.h> | ||
26 | #include <asm/crypto/glue_helper.h> | ||
27 | #include <crypto/scatterwalk.h> | ||
28 | |||
29 | #define TF_AVX2_PARALLEL_BLOCKS 16 | ||
30 | |||
31 | /* 16-way AVX2 parallel cipher functions */ | ||
32 | asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
33 | const u8 *src); | ||
34 | asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
35 | const u8 *src); | ||
36 | asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); | ||
37 | |||
38 | asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, | ||
39 | le128 *iv); | ||
40 | |||
41 | asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, le128 *iv); | ||
45 | |||
46 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
47 | const u8 *src) | ||
48 | { | ||
49 | __twofish_enc_blk_3way(ctx, dst, src, false); | ||
50 | } | ||
51 | |||
52 | static const struct common_glue_ctx twofish_enc = { | ||
53 | .num_funcs = 4, | ||
54 | .fpu_blocks_limit = 8, | ||
55 | |||
56 | .funcs = { { | ||
57 | .num_blocks = 16, | ||
58 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } | ||
59 | }, { | ||
60 | .num_blocks = 8, | ||
61 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } | ||
62 | }, { | ||
63 | .num_blocks = 3, | ||
64 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } | ||
65 | }, { | ||
66 | .num_blocks = 1, | ||
67 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } | ||
68 | } } | ||
69 | }; | ||
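[Editor's note] The common_glue_ctx tables list the available widths in decreasing order (16, 8, 3, then 1 block); the generic glue code keeps using the widest function that still fits the bytes left in the current scatterlist segment before falling through to the narrower ones, and fpu_blocks_limit = 8 means SIMD registers are only claimed once at least eight blocks are pending. A rough sketch of the per-segment dispatch, with the surrounding walk omitted and all names other than the structures above illustrative:

	/* Hedged sketch of the glue-helper dispatch; not copied from glue_helper.c. */
	for (i = 0; i < gctx->num_funcs; i++) {
		unsigned int func_bytes = bsize * gctx->funcs[i].num_blocks;

		while (nbytes >= func_bytes) {
			/* widest first: 16-way, then 8-way, 3-way, single block */
			gctx->funcs[i].fn_u.ecb(ctx, dst, src);
			src += func_bytes;
			dst += func_bytes;
			nbytes -= func_bytes;
		}
	}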
70 | |||
71 | static const struct common_glue_ctx twofish_ctr = { | ||
72 | .num_funcs = 4, | ||
73 | .fpu_blocks_limit = 8, | ||
74 | |||
75 | .funcs = { { | ||
76 | .num_blocks = 16, | ||
77 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } | ||
78 | }, { | ||
79 | .num_blocks = 8, | ||
80 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } | ||
81 | }, { | ||
82 | .num_blocks = 3, | ||
83 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } | ||
84 | }, { | ||
85 | .num_blocks = 1, | ||
86 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } | ||
87 | } } | ||
88 | }; | ||
89 | |||
90 | static const struct common_glue_ctx twofish_enc_xts = { | ||
91 | .num_funcs = 3, | ||
92 | .fpu_blocks_limit = 8, | ||
93 | |||
94 | .funcs = { { | ||
95 | .num_blocks = 16, | ||
96 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } | ||
97 | }, { | ||
98 | .num_blocks = 8, | ||
99 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } | ||
100 | }, { | ||
101 | .num_blocks = 1, | ||
102 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } | ||
103 | } } | ||
104 | }; | ||
105 | |||
106 | static const struct common_glue_ctx twofish_dec = { | ||
107 | .num_funcs = 4, | ||
108 | .fpu_blocks_limit = 8, | ||
109 | |||
110 | .funcs = { { | ||
111 | .num_blocks = 16, | ||
112 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } | ||
113 | }, { | ||
114 | .num_blocks = 8, | ||
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } | ||
116 | }, { | ||
117 | .num_blocks = 3, | ||
118 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } | ||
119 | }, { | ||
120 | .num_blocks = 1, | ||
121 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } | ||
122 | } } | ||
123 | }; | ||
124 | |||
125 | static const struct common_glue_ctx twofish_dec_cbc = { | ||
126 | .num_funcs = 4, | ||
127 | .fpu_blocks_limit = 8, | ||
128 | |||
129 | .funcs = { { | ||
130 | .num_blocks = 16, | ||
131 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } | ||
132 | }, { | ||
133 | .num_blocks = 8, | ||
134 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } | ||
135 | }, { | ||
136 | .num_blocks = 3, | ||
137 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } | ||
138 | }, { | ||
139 | .num_blocks = 1, | ||
140 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } | ||
141 | } } | ||
142 | }; | ||
143 | |||
144 | static const struct common_glue_ctx twofish_dec_xts = { | ||
145 | .num_funcs = 3, | ||
146 | .fpu_blocks_limit = 8, | ||
147 | |||
148 | .funcs = { { | ||
149 | .num_blocks = 16, | ||
150 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } | ||
151 | }, { | ||
152 | .num_blocks = 8, | ||
153 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } | ||
154 | }, { | ||
155 | .num_blocks = 1, | ||
156 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } | ||
157 | } } | ||
158 | }; | ||
159 | |||
160 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); | ||
164 | } | ||
165 | |||
166 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); | ||
170 | } | ||
171 | |||
172 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
173 | struct scatterlist *src, unsigned int nbytes) | ||
174 | { | ||
175 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, | ||
176 | dst, src, nbytes); | ||
177 | } | ||
178 | |||
179 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
180 | struct scatterlist *src, unsigned int nbytes) | ||
181 | { | ||
182 | return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, | ||
183 | nbytes); | ||
184 | } | ||
185 | |||
186 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
187 | struct scatterlist *src, unsigned int nbytes) | ||
188 | { | ||
189 | return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); | ||
190 | } | ||
191 | |||
192 | static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
193 | { | ||
194 | /* Since the 8-way AVX routines are reused, start using the FPU at 8 parallel blocks. */ | ||
195 | return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); | ||
196 | } | ||
197 | |||
198 | static inline void twofish_fpu_end(bool fpu_enabled) | ||
199 | { | ||
200 | glue_fpu_end(fpu_enabled); | ||
201 | } | ||
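[Editor's note] In-kernel SIMD use has to be bracketed by kernel_fpu_begin()/kernel_fpu_end(); glue_fpu_begin defers the relatively costly state save until at least the given number of blocks, here eight, is queued, so short requests stay on the scalar code. For the LRW path below, the fpu_enabled flag is threaded through struct crypt_priv so that the begin/end pair wraps the whole lrw_crypt walk rather than every callback invocation.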
202 | |||
203 | struct crypt_priv { | ||
204 | struct twofish_ctx *ctx; | ||
205 | bool fpu_enabled; | ||
206 | }; | ||
207 | |||
208 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
209 | { | ||
210 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
211 | struct crypt_priv *ctx = priv; | ||
212 | int i; | ||
213 | |||
214 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
215 | |||
216 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
217 | twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
218 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
219 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
220 | } | ||
221 | |||
222 | while (nbytes >= 8 * bsize) { | ||
223 | twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); | ||
224 | srcdst += bsize * 8; | ||
225 | nbytes -= bsize * 8; | ||
226 | } | ||
227 | |||
228 | while (nbytes >= 3 * bsize) { | ||
229 | twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); | ||
230 | srcdst += bsize * 3; | ||
231 | nbytes -= bsize * 3; | ||
232 | } | ||
233 | |||
234 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
235 | twofish_enc_blk(ctx->ctx, srcdst, srcdst); | ||
236 | } | ||
237 | |||
238 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
239 | { | ||
240 | const unsigned int bsize = TF_BLOCK_SIZE; | ||
241 | struct crypt_priv *ctx = priv; | ||
242 | int i; | ||
243 | |||
244 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | ||
245 | |||
246 | while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { | ||
247 | twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
248 | srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
249 | nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; | ||
250 | } | ||
251 | |||
252 | while (nbytes >= 8 * bsize) { | ||
253 | twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); | ||
254 | srcdst += bsize * 8; | ||
255 | nbytes -= bsize * 8; | ||
256 | } | ||
257 | |||
258 | while (nbytes >= 3 * bsize) { | ||
259 | twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); | ||
260 | srcdst += bsize * 3; | ||
261 | nbytes -= bsize * 3; | ||
262 | } | ||
263 | |||
264 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
265 | twofish_dec_blk(ctx->ctx, srcdst, srcdst); | ||
266 | } | ||
267 | |||
268 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
269 | struct scatterlist *src, unsigned int nbytes) | ||
270 | { | ||
271 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
272 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
273 | struct crypt_priv crypt_ctx = { | ||
274 | .ctx = &ctx->twofish_ctx, | ||
275 | .fpu_enabled = false, | ||
276 | }; | ||
277 | struct lrw_crypt_req req = { | ||
278 | .tbuf = buf, | ||
279 | .tbuflen = sizeof(buf), | ||
280 | |||
281 | .table_ctx = &ctx->lrw_table, | ||
282 | .crypt_ctx = &crypt_ctx, | ||
283 | .crypt_fn = encrypt_callback, | ||
284 | }; | ||
285 | int ret; | ||
286 | |||
287 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
288 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
289 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
290 | |||
291 | return ret; | ||
292 | } | ||
293 | |||
294 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
295 | struct scatterlist *src, unsigned int nbytes) | ||
296 | { | ||
297 | struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
298 | be128 buf[TF_AVX2_PARALLEL_BLOCKS]; | ||
299 | struct crypt_priv crypt_ctx = { | ||
300 | .ctx = &ctx->twofish_ctx, | ||
301 | .fpu_enabled = false, | ||
302 | }; | ||
303 | struct lrw_crypt_req req = { | ||
304 | .tbuf = buf, | ||
305 | .tbuflen = sizeof(buf), | ||
306 | |||
307 | .table_ctx = &ctx->lrw_table, | ||
308 | .crypt_ctx = &crypt_ctx, | ||
309 | .crypt_fn = decrypt_callback, | ||
310 | }; | ||
311 | int ret; | ||
312 | |||
313 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
314 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
315 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
316 | |||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
321 | struct scatterlist *src, unsigned int nbytes) | ||
322 | { | ||
323 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
324 | |||
325 | return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, | ||
326 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
327 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
328 | } | ||
329 | |||
330 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
331 | struct scatterlist *src, unsigned int nbytes) | ||
332 | { | ||
333 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
334 | |||
335 | return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, | ||
336 | XTS_TWEAK_CAST(twofish_enc_blk), | ||
337 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
338 | } | ||
339 | |||
340 | static struct crypto_alg tf_algs[10] = { { | ||
341 | .cra_name = "__ecb-twofish-avx2", | ||
342 | .cra_driver_name = "__driver-ecb-twofish-avx2", | ||
343 | .cra_priority = 0, | ||
344 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
345 | .cra_blocksize = TF_BLOCK_SIZE, | ||
346 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
347 | .cra_alignmask = 0, | ||
348 | .cra_type = &crypto_blkcipher_type, | ||
349 | .cra_module = THIS_MODULE, | ||
350 | .cra_u = { | ||
351 | .blkcipher = { | ||
352 | .min_keysize = TF_MIN_KEY_SIZE, | ||
353 | .max_keysize = TF_MAX_KEY_SIZE, | ||
354 | .setkey = twofish_setkey, | ||
355 | .encrypt = ecb_encrypt, | ||
356 | .decrypt = ecb_decrypt, | ||
357 | }, | ||
358 | }, | ||
359 | }, { | ||
360 | .cra_name = "__cbc-twofish-avx2", | ||
361 | .cra_driver_name = "__driver-cbc-twofish-avx2", | ||
362 | .cra_priority = 0, | ||
363 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
364 | .cra_blocksize = TF_BLOCK_SIZE, | ||
365 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
366 | .cra_alignmask = 0, | ||
367 | .cra_type = &crypto_blkcipher_type, | ||
368 | .cra_module = THIS_MODULE, | ||
369 | .cra_u = { | ||
370 | .blkcipher = { | ||
371 | .min_keysize = TF_MIN_KEY_SIZE, | ||
372 | .max_keysize = TF_MAX_KEY_SIZE, | ||
373 | .setkey = twofish_setkey, | ||
374 | .encrypt = cbc_encrypt, | ||
375 | .decrypt = cbc_decrypt, | ||
376 | }, | ||
377 | }, | ||
378 | }, { | ||
379 | .cra_name = "__ctr-twofish-avx2", | ||
380 | .cra_driver_name = "__driver-ctr-twofish-avx2", | ||
381 | .cra_priority = 0, | ||
382 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
383 | .cra_blocksize = 1, | ||
384 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
385 | .cra_alignmask = 0, | ||
386 | .cra_type = &crypto_blkcipher_type, | ||
387 | .cra_module = THIS_MODULE, | ||
388 | .cra_u = { | ||
389 | .blkcipher = { | ||
390 | .min_keysize = TF_MIN_KEY_SIZE, | ||
391 | .max_keysize = TF_MAX_KEY_SIZE, | ||
392 | .ivsize = TF_BLOCK_SIZE, | ||
393 | .setkey = twofish_setkey, | ||
394 | .encrypt = ctr_crypt, | ||
395 | .decrypt = ctr_crypt, | ||
396 | }, | ||
397 | }, | ||
398 | }, { | ||
399 | .cra_name = "__lrw-twofish-avx2", | ||
400 | .cra_driver_name = "__driver-lrw-twofish-avx2", | ||
401 | .cra_priority = 0, | ||
402 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
403 | .cra_blocksize = TF_BLOCK_SIZE, | ||
404 | .cra_ctxsize = sizeof(struct twofish_lrw_ctx), | ||
405 | .cra_alignmask = 0, | ||
406 | .cra_type = &crypto_blkcipher_type, | ||
407 | .cra_module = THIS_MODULE, | ||
408 | .cra_exit = lrw_twofish_exit_tfm, | ||
409 | .cra_u = { | ||
410 | .blkcipher = { | ||
411 | .min_keysize = TF_MIN_KEY_SIZE + | ||
412 | TF_BLOCK_SIZE, | ||
413 | .max_keysize = TF_MAX_KEY_SIZE + | ||
414 | TF_BLOCK_SIZE, | ||
415 | .ivsize = TF_BLOCK_SIZE, | ||
416 | .setkey = lrw_twofish_setkey, | ||
417 | .encrypt = lrw_encrypt, | ||
418 | .decrypt = lrw_decrypt, | ||
419 | }, | ||
420 | }, | ||
421 | }, { | ||
422 | .cra_name = "__xts-twofish-avx2", | ||
423 | .cra_driver_name = "__driver-xts-twofish-avx2", | ||
424 | .cra_priority = 0, | ||
425 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
426 | .cra_blocksize = TF_BLOCK_SIZE, | ||
427 | .cra_ctxsize = sizeof(struct twofish_xts_ctx), | ||
428 | .cra_alignmask = 0, | ||
429 | .cra_type = &crypto_blkcipher_type, | ||
430 | .cra_module = THIS_MODULE, | ||
431 | .cra_u = { | ||
432 | .blkcipher = { | ||
433 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
434 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
435 | .ivsize = TF_BLOCK_SIZE, | ||
436 | .setkey = xts_twofish_setkey, | ||
437 | .encrypt = xts_encrypt, | ||
438 | .decrypt = xts_decrypt, | ||
439 | }, | ||
440 | }, | ||
441 | }, { | ||
442 | .cra_name = "ecb(twofish)", | ||
443 | .cra_driver_name = "ecb-twofish-avx2", | ||
444 | .cra_priority = 500, | ||
445 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
446 | .cra_blocksize = TF_BLOCK_SIZE, | ||
447 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
448 | .cra_alignmask = 0, | ||
449 | .cra_type = &crypto_ablkcipher_type, | ||
450 | .cra_module = THIS_MODULE, | ||
451 | .cra_init = ablk_init, | ||
452 | .cra_exit = ablk_exit, | ||
453 | .cra_u = { | ||
454 | .ablkcipher = { | ||
455 | .min_keysize = TF_MIN_KEY_SIZE, | ||
456 | .max_keysize = TF_MAX_KEY_SIZE, | ||
457 | .setkey = ablk_set_key, | ||
458 | .encrypt = ablk_encrypt, | ||
459 | .decrypt = ablk_decrypt, | ||
460 | }, | ||
461 | }, | ||
462 | }, { | ||
463 | .cra_name = "cbc(twofish)", | ||
464 | .cra_driver_name = "cbc-twofish-avx2", | ||
465 | .cra_priority = 500, | ||
466 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
467 | .cra_blocksize = TF_BLOCK_SIZE, | ||
468 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
469 | .cra_alignmask = 0, | ||
470 | .cra_type = &crypto_ablkcipher_type, | ||
471 | .cra_module = THIS_MODULE, | ||
472 | .cra_init = ablk_init, | ||
473 | .cra_exit = ablk_exit, | ||
474 | .cra_u = { | ||
475 | .ablkcipher = { | ||
476 | .min_keysize = TF_MIN_KEY_SIZE, | ||
477 | .max_keysize = TF_MAX_KEY_SIZE, | ||
478 | .ivsize = TF_BLOCK_SIZE, | ||
479 | .setkey = ablk_set_key, | ||
480 | .encrypt = __ablk_encrypt, | ||
481 | .decrypt = ablk_decrypt, | ||
482 | }, | ||
483 | }, | ||
484 | }, { | ||
485 | .cra_name = "ctr(twofish)", | ||
486 | .cra_driver_name = "ctr-twofish-avx2", | ||
487 | .cra_priority = 500, | ||
488 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
489 | .cra_blocksize = 1, | ||
490 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
491 | .cra_alignmask = 0, | ||
492 | .cra_type = &crypto_ablkcipher_type, | ||
493 | .cra_module = THIS_MODULE, | ||
494 | .cra_init = ablk_init, | ||
495 | .cra_exit = ablk_exit, | ||
496 | .cra_u = { | ||
497 | .ablkcipher = { | ||
498 | .min_keysize = TF_MIN_KEY_SIZE, | ||
499 | .max_keysize = TF_MAX_KEY_SIZE, | ||
500 | .ivsize = TF_BLOCK_SIZE, | ||
501 | .setkey = ablk_set_key, | ||
502 | .encrypt = ablk_encrypt, | ||
503 | .decrypt = ablk_encrypt, | ||
504 | .geniv = "chainiv", | ||
505 | }, | ||
506 | }, | ||
507 | }, { | ||
508 | .cra_name = "lrw(twofish)", | ||
509 | .cra_driver_name = "lrw-twofish-avx2", | ||
510 | .cra_priority = 500, | ||
511 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
512 | .cra_blocksize = TF_BLOCK_SIZE, | ||
513 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
514 | .cra_alignmask = 0, | ||
515 | .cra_type = &crypto_ablkcipher_type, | ||
516 | .cra_module = THIS_MODULE, | ||
517 | .cra_init = ablk_init, | ||
518 | .cra_exit = ablk_exit, | ||
519 | .cra_u = { | ||
520 | .ablkcipher = { | ||
521 | .min_keysize = TF_MIN_KEY_SIZE + | ||
522 | TF_BLOCK_SIZE, | ||
523 | .max_keysize = TF_MAX_KEY_SIZE + | ||
524 | TF_BLOCK_SIZE, | ||
525 | .ivsize = TF_BLOCK_SIZE, | ||
526 | .setkey = ablk_set_key, | ||
527 | .encrypt = ablk_encrypt, | ||
528 | .decrypt = ablk_decrypt, | ||
529 | }, | ||
530 | }, | ||
531 | }, { | ||
532 | .cra_name = "xts(twofish)", | ||
533 | .cra_driver_name = "xts-twofish-avx2", | ||
534 | .cra_priority = 500, | ||
535 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
536 | .cra_blocksize = TF_BLOCK_SIZE, | ||
537 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
538 | .cra_alignmask = 0, | ||
539 | .cra_type = &crypto_ablkcipher_type, | ||
540 | .cra_module = THIS_MODULE, | ||
541 | .cra_init = ablk_init, | ||
542 | .cra_exit = ablk_exit, | ||
543 | .cra_u = { | ||
544 | .ablkcipher = { | ||
545 | .min_keysize = TF_MIN_KEY_SIZE * 2, | ||
546 | .max_keysize = TF_MAX_KEY_SIZE * 2, | ||
547 | .ivsize = TF_BLOCK_SIZE, | ||
548 | .setkey = ablk_set_key, | ||
549 | .encrypt = ablk_encrypt, | ||
550 | .decrypt = ablk_decrypt, | ||
551 | }, | ||
552 | }, | ||
553 | } }; | ||
554 | |||
555 | static int __init init(void) | ||
556 | { | ||
557 | u64 xcr0; | ||
558 | |||
559 | if (!cpu_has_avx2 || !cpu_has_osxsave) { | ||
560 | pr_info("AVX2 instructions are not detected.\n"); | ||
561 | return -ENODEV; | ||
562 | } | ||
563 | |||
564 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
565 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
566 | pr_info("AVX2 detected but unusable.\n"); | ||
567 | return -ENODEV; | ||
568 | } | ||
569 | |||
570 | return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
571 | } | ||
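[Editor's note] Checking cpu_has_avx2 alone is not sufficient: the xgetbv(XCR_XFEATURE_ENABLED_MASK) read, guarded by cpu_has_osxsave, verifies that the OS has actually enabled saving of both SSE and YMM state in XCR0. Without that, executing the AVX2 instructions used by the 16-way code would fault, so the module refuses to load and the lower-priority Twofish implementations are used instead.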
572 | |||
573 | static void __exit fini(void) | ||
574 | { | ||
575 | crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); | ||
576 | } | ||
577 | |||
578 | module_init(init); | ||
579 | module_exit(fini); | ||
580 | |||
581 | MODULE_LICENSE("GPL"); | ||
582 | MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); | ||
583 | MODULE_ALIAS("twofish"); | ||
584 | MODULE_ALIAS("twofish-asm"); | ||
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 94ac91d26e47..2047a562f6b3 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c | |||
@@ -4,6 +4,8 @@ | |||
4 | * Copyright (C) 2012 Johannes Goetzfried | 4 | * Copyright (C) 2012 Johannes Goetzfried |
5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> | 5 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
6 | * | 6 | * |
7 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
8 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
9 | * the Free Software Foundation; either version 2 of the License, or | 11 | * the Free Software Foundation; either version 2 of the License, or |
@@ -48,13 +50,26 @@ | |||
48 | /* 8-way parallel cipher functions */ | 50 | /* 8-way parallel cipher functions */ |
49 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, | 51 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, |
50 | const u8 *src); | 52 | const u8 *src); |
53 | EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); | ||
54 | |||
51 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 55 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
52 | const u8 *src); | 56 | const u8 *src); |
57 | EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); | ||
53 | 58 | ||
54 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, | 59 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
55 | const u8 *src); | 60 | const u8 *src); |
61 | EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); | ||
62 | |||
56 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, | 63 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, |
57 | const u8 *src, le128 *iv); | 64 | const u8 *src, le128 *iv); |
65 | EXPORT_SYMBOL_GPL(twofish_ctr_8way); | ||
66 | |||
67 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, | ||
68 | const u8 *src, le128 *iv); | ||
69 | EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); | ||
70 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
71 | const u8 *src, le128 *iv); | ||
72 | EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); | ||
58 | 73 | ||
59 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | 74 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, |
60 | const u8 *src) | 75 | const u8 *src) |
@@ -62,6 +77,20 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | |||
62 | __twofish_enc_blk_3way(ctx, dst, src, false); | 77 | __twofish_enc_blk_3way(ctx, dst, src, false); |
63 | } | 78 | } |
64 | 79 | ||
80 | void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
81 | { | ||
82 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
83 | GLUE_FUNC_CAST(twofish_enc_blk)); | ||
84 | } | ||
85 | EXPORT_SYMBOL_GPL(twofish_xts_enc); | ||
86 | |||
87 | void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | ||
88 | { | ||
89 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | ||
90 | GLUE_FUNC_CAST(twofish_dec_blk)); | ||
91 | } | ||
92 | EXPORT_SYMBOL_GPL(twofish_xts_dec); | ||
93 | |||
65 | 94 | ||
66 | static const struct common_glue_ctx twofish_enc = { | 95 | static const struct common_glue_ctx twofish_enc = { |
67 | .num_funcs = 3, | 96 | .num_funcs = 3, |
@@ -95,6 +124,19 @@ static const struct common_glue_ctx twofish_ctr = { | |||
95 | } } | 124 | } } |
96 | }; | 125 | }; |
97 | 126 | ||
127 | static const struct common_glue_ctx twofish_enc_xts = { | ||
128 | .num_funcs = 2, | ||
129 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, | ||
130 | |||
131 | .funcs = { { | ||
132 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | ||
133 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } | ||
134 | }, { | ||
135 | .num_blocks = 1, | ||
136 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } | ||
137 | } } | ||
138 | }; | ||
139 | |||
98 | static const struct common_glue_ctx twofish_dec = { | 140 | static const struct common_glue_ctx twofish_dec = { |
99 | .num_funcs = 3, | 141 | .num_funcs = 3, |
100 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, | 142 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, |
@@ -127,6 +169,19 @@ static const struct common_glue_ctx twofish_dec_cbc = { | |||
127 | } } | 169 | } } |
128 | }; | 170 | }; |
129 | 171 | ||
172 | static const struct common_glue_ctx twofish_dec_xts = { | ||
173 | .num_funcs = 2, | ||
174 | .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, | ||
175 | |||
176 | .funcs = { { | ||
177 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | ||
178 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } | ||
179 | }, { | ||
180 | .num_blocks = 1, | ||
181 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } | ||
182 | } } | ||
183 | }; | ||
184 | |||
130 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 185 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
131 | struct scatterlist *src, unsigned int nbytes) | 186 | struct scatterlist *src, unsigned int nbytes) |
132 | { | 187 | { |
@@ -275,54 +330,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
275 | struct scatterlist *src, unsigned int nbytes) | 330 | struct scatterlist *src, unsigned int nbytes) |
276 | { | 331 | { |
277 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 332 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
278 | be128 buf[TWOFISH_PARALLEL_BLOCKS]; | ||
279 | struct crypt_priv crypt_ctx = { | ||
280 | .ctx = &ctx->crypt_ctx, | ||
281 | .fpu_enabled = false, | ||
282 | }; | ||
283 | struct xts_crypt_req req = { | ||
284 | .tbuf = buf, | ||
285 | .tbuflen = sizeof(buf), | ||
286 | 333 | ||
287 | .tweak_ctx = &ctx->tweak_ctx, | 334 | return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, |
288 | .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), | 335 | XTS_TWEAK_CAST(twofish_enc_blk), |
289 | .crypt_ctx = &crypt_ctx, | 336 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
290 | .crypt_fn = encrypt_callback, | ||
291 | }; | ||
292 | int ret; | ||
293 | |||
294 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
295 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
296 | twofish_fpu_end(crypt_ctx.fpu_enabled); | ||
297 | |||
298 | return ret; | ||
299 | } | 337 | } |
300 | 338 | ||
301 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 339 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
302 | struct scatterlist *src, unsigned int nbytes) | 340 | struct scatterlist *src, unsigned int nbytes) |
303 | { | 341 | { |
304 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 342 | struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
305 | be128 buf[TWOFISH_PARALLEL_BLOCKS]; | ||
306 | struct crypt_priv crypt_ctx = { | ||
307 | .ctx = &ctx->crypt_ctx, | ||
308 | .fpu_enabled = false, | ||
309 | }; | ||
310 | struct xts_crypt_req req = { | ||
311 | .tbuf = buf, | ||
312 | .tbuflen = sizeof(buf), | ||
313 | |||
314 | .tweak_ctx = &ctx->tweak_ctx, | ||
315 | .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), | ||
316 | .crypt_ctx = &crypt_ctx, | ||
317 | .crypt_fn = decrypt_callback, | ||
318 | }; | ||
319 | int ret; | ||
320 | 343 | ||
321 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 344 | return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, |
322 | ret = xts_crypt(desc, dst, src, nbytes, &req); | 345 | XTS_TWEAK_CAST(twofish_enc_blk), |
323 | twofish_fpu_end(crypt_ctx.fpu_enabled); | 346 | &ctx->tweak_ctx, &ctx->crypt_ctx); |
324 | |||
325 | return ret; | ||
326 | } | 347 | } |
327 | 348 | ||
328 | static struct crypto_alg twofish_algs[10] = { { | 349 | static struct crypto_alg twofish_algs[10] = { { |
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8010ebc5705f..e99ac27f95b2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -293,6 +293,7 @@ extern const char * const x86_power_flags[32]; | |||
293 | #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) | 293 | #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) |
294 | #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) | 294 | #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) |
295 | #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) | 295 | #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) |
296 | #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) | ||
296 | #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) | 297 | #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) |
297 | #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) | 298 | #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) |
298 | #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) | 299 | #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) |
diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h new file mode 100644 index 000000000000..f097b2face10 --- /dev/null +++ b/arch/x86/include/asm/crypto/blowfish.h | |||
@@ -0,0 +1,43 @@ | |||
1 | #ifndef ASM_X86_BLOWFISH_H | ||
2 | #define ASM_X86_BLOWFISH_H | ||
3 | |||
4 | #include <linux/crypto.h> | ||
5 | #include <crypto/blowfish.h> | ||
6 | |||
7 | #define BF_PARALLEL_BLOCKS 4 | ||
8 | |||
9 | /* regular block cipher functions */ | ||
10 | asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, | ||
11 | bool xor); | ||
12 | asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); | ||
13 | |||
14 | /* 4-way parallel cipher functions */ | ||
15 | asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
16 | const u8 *src, bool xor); | ||
17 | asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
18 | const u8 *src); | ||
19 | |||
20 | static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) | ||
21 | { | ||
22 | __blowfish_enc_blk(ctx, dst, src, false); | ||
23 | } | ||
24 | |||
25 | static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, | ||
26 | const u8 *src) | ||
27 | { | ||
28 | __blowfish_enc_blk(ctx, dst, src, true); | ||
29 | } | ||
30 | |||
31 | static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, | ||
32 | const u8 *src) | ||
33 | { | ||
34 | __blowfish_enc_blk_4way(ctx, dst, src, false); | ||
35 | } | ||
36 | |||
37 | static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, | ||
38 | const u8 *src) | ||
39 | { | ||
40 | __blowfish_enc_blk_4way(ctx, dst, src, true); | ||
41 | } | ||
42 | |||
43 | #endif | ||
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index 98038add801e..bb93333d9200 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h | |||
@@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, | |||
48 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, | 48 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, |
49 | const u8 *src); | 49 | const u8 *src); |
50 | 50 | ||
51 | /* 16-way parallel cipher functions (avx/aes-ni) */ | ||
52 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
53 | const u8 *src); | ||
54 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
55 | const u8 *src); | ||
56 | |||
57 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
58 | const u8 *src); | ||
59 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | ||
60 | const u8 *src, le128 *iv); | ||
61 | |||
62 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
63 | const u8 *src, le128 *iv); | ||
64 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
65 | const u8 *src, le128 *iv); | ||
66 | |||
51 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, | 67 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, |
52 | const u8 *src) | 68 | const u8 *src) |
53 | { | 69 | { |
@@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, | |||
79 | extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, | 95 | extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, |
80 | le128 *iv); | 96 | le128 *iv); |
81 | 97 | ||
98 | extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
99 | extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
100 | |||
82 | #endif /* ASM_X86_CAMELLIA_H */ | 101 | #endif /* ASM_X86_CAMELLIA_H */ |
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index e2d65b061d27..1eef55596e82 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h | |||
@@ -14,10 +14,13 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); | |||
14 | typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); | 14 | typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); |
15 | typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, | 15 | typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, |
16 | le128 *iv); | 16 | le128 *iv); |
17 | typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src, | ||
18 | le128 *iv); | ||
17 | 19 | ||
18 | #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) | 20 | #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) |
19 | #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) | 21 | #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) |
20 | #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) | 22 | #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) |
23 | #define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn)) | ||
21 | 24 | ||
22 | struct common_glue_func_entry { | 25 | struct common_glue_func_entry { |
23 | unsigned int num_blocks; /* number of blocks that @fn will process */ | 26 | unsigned int num_blocks; /* number of blocks that @fn will process */ |
@@ -25,6 +28,7 @@ struct common_glue_func_entry { | |||
25 | common_glue_func_t ecb; | 28 | common_glue_func_t ecb; |
26 | common_glue_cbc_func_t cbc; | 29 | common_glue_cbc_func_t cbc; |
27 | common_glue_ctr_func_t ctr; | 30 | common_glue_ctr_func_t ctr; |
31 | common_glue_xts_func_t xts; | ||
28 | } fn_u; | 32 | } fn_u; |
29 | }; | 33 | }; |
30 | 34 | ||
@@ -96,6 +100,16 @@ static inline void le128_inc(le128 *i) | |||
96 | i->b = cpu_to_le64(b); | 100 | i->b = cpu_to_le64(b); |
97 | } | 101 | } |
98 | 102 | ||
103 | static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src) | ||
104 | { | ||
105 | u64 a = le64_to_cpu(src->a); | ||
106 | u64 b = le64_to_cpu(src->b); | ||
107 | u64 _tt = ((s64)a >> 63) & 0x87; | ||
108 | |||
109 | dst->a = cpu_to_le64((a << 1) ^ (b >> 63)); | ||
110 | dst->b = cpu_to_le64((b << 1) ^ _tt); | ||
111 | } | ||
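[Editor's note] le128_gf128mul_x_ble multiplies the 128-bit XTS tweak by x (often written as the primitive element alpha) in GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1: the value is shifted left one bit across both 64-bit halves and, when the bit shifted out of the top is set, the constant 0x87 (= x^7 + x^2 + x + 1) is folded back into the low half. The new XTS glue code uses this to step the tweak from one 16-byte block to the next. A hedged usage sketch, with everything except the helper itself illustrative:

	/* Hedged sketch: advancing the XTS tweak between consecutive blocks.
	 * t is assumed to start as the sector IV encrypted under the tweak
	 * key (the ->tweak_ctx half of the XTS context). */
	le128 t = initial_tweak;
	for (i = 0; i < nblocks; i++) {
		/* C[i] = E_K1(P[i] XOR t) XOR t, then t <- t * x */
		le128_gf128mul_x_ble(&t, &t);
	}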
112 | |||
99 | extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, | 113 | extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, |
100 | struct blkcipher_desc *desc, | 114 | struct blkcipher_desc *desc, |
101 | struct scatterlist *dst, | 115 | struct scatterlist *dst, |
@@ -118,4 +132,14 @@ extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |||
118 | struct scatterlist *dst, | 132 | struct scatterlist *dst, |
119 | struct scatterlist *src, unsigned int nbytes); | 133 | struct scatterlist *src, unsigned int nbytes); |
120 | 134 | ||
135 | extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | ||
136 | struct blkcipher_desc *desc, | ||
137 | struct scatterlist *dst, | ||
138 | struct scatterlist *src, unsigned int nbytes, | ||
139 | common_glue_func_t tweak_fn, void *tweak_ctx, | ||
140 | void *crypt_ctx); | ||
141 | |||
142 | extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, | ||
143 | le128 *iv, common_glue_func_t fn); | ||
144 | |||
121 | #endif /* _CRYPTO_GLUE_HELPER_H */ | 145 | #endif /* _CRYPTO_GLUE_HELPER_H */ |
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 0da1d3e2a55c..33c2b8a435da 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h | |||
@@ -6,6 +6,16 @@ | |||
6 | 6 | ||
7 | #define SERPENT_PARALLEL_BLOCKS 8 | 7 | #define SERPENT_PARALLEL_BLOCKS 8 |
8 | 8 | ||
9 | struct serpent_lrw_ctx { | ||
10 | struct lrw_table_ctx lrw_table; | ||
11 | struct serpent_ctx serpent_ctx; | ||
12 | }; | ||
13 | |||
14 | struct serpent_xts_ctx { | ||
15 | struct serpent_ctx tweak_ctx; | ||
16 | struct serpent_ctx crypt_ctx; | ||
17 | }; | ||
18 | |||
9 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 19 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
10 | const u8 *src); | 20 | const u8 *src); |
11 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 21 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
@@ -16,4 +26,23 @@ asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | |||
16 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 26 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
17 | const u8 *src, le128 *iv); | 27 | const u8 *src, le128 *iv); |
18 | 28 | ||
29 | asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
30 | const u8 *src, le128 *iv); | ||
31 | asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, | ||
32 | const u8 *src, le128 *iv); | ||
33 | |||
34 | extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, | ||
35 | le128 *iv); | ||
36 | |||
37 | extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
38 | extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
39 | |||
40 | extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
41 | unsigned int keylen); | ||
42 | |||
43 | extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm); | ||
44 | |||
45 | extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
46 | unsigned int keylen); | ||
47 | |||
19 | #endif | 48 | #endif |
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 878c51ceebb5..e655c6029b45 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h | |||
@@ -28,6 +28,20 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | |||
28 | asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, | 28 | asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, |
29 | const u8 *src); | 29 | const u8 *src); |
30 | 30 | ||
31 | /* 8-way parallel cipher functions */ | ||
32 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, | ||
33 | const u8 *src); | ||
34 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
35 | const u8 *src); | ||
36 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
37 | const u8 *src); | ||
38 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, | ||
39 | const u8 *src, le128 *iv); | ||
40 | asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, | ||
41 | const u8 *src, le128 *iv); | ||
42 | asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, | ||
43 | const u8 *src, le128 *iv); | ||
44 | |||
31 | /* helpers from twofish_x86_64-3way module */ | 45 | /* helpers from twofish_x86_64-3way module */ |
32 | extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); | 46 | extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); |
33 | extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, | 47 | extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, |
@@ -43,4 +57,8 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); | |||
43 | extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, | 57 | extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, |
44 | unsigned int keylen); | 58 | unsigned int keylen); |
45 | 59 | ||
60 | /* helpers from twofish-avx module */ | ||
61 | extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
62 | extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
63 | |||
46 | #endif /* ASM_X86_TWOFISH_H */ | 64 | #endif /* ASM_X86_TWOFISH_H */ |