diff options
author | Martin Willi <martin@strongswan.org> | 2015-07-16 13:14:01 -0400 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2015-07-17 09:20:24 -0400 |
commit | c9320b6dcb89658a5e53b4f8e31f4c2ee810ec2d (patch) | |
tree | e95f632a25b80944460e04ce8a7c646342603d9c | |
parent | 31d7247da57226e847f0f102a10c27c0722c429b (diff) |
crypto: chacha20 - Add a SSSE3 SIMD variant for x86_64
Implements an x86_64 assembler driver for the ChaCha20 stream cipher. This
single-block variant works on a single state matrix using SSE instructions.
It requires SSSE3 due to the use of pshufb for efficient 8/16-bit rotate
operations.
For large messages, throughput increases by ~65% compared to
chacha20-generic:
testing speed of chacha20 (chacha20-generic) encryption
test 0 (256 bit key, 16 byte blocks): 45089207 operations in 10 seconds (721427312 bytes)
test 1 (256 bit key, 64 byte blocks): 43839521 operations in 10 seconds (2805729344 bytes)
test 2 (256 bit key, 256 byte blocks): 12702056 operations in 10 seconds (3251726336 bytes)
test 3 (256 bit key, 1024 byte blocks): 3371173 operations in 10 seconds (3452081152 bytes)
test 4 (256 bit key, 8192 byte blocks): 422468 operations in 10 seconds (3460857856 bytes)
testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 43141886 operations in 10 seconds (690270176 bytes)
test 1 (256 bit key, 64 byte blocks): 46845874 operations in 10 seconds (2998135936 bytes)
test 2 (256 bit key, 256 byte blocks): 18458512 operations in 10 seconds (4725379072 bytes)
test 3 (256 bit key, 1024 byte blocks): 5360533 operations in 10 seconds (5489185792 bytes)
test 4 (256 bit key, 8192 byte blocks): 692846 operations in 10 seconds (5675794432 bytes)
Benchmark results from a Core i5-4670T.
Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r-- | arch/x86/crypto/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/crypto/chacha20-ssse3-x86_64.S | 142 | ||||
-rw-r--r-- | arch/x86/crypto/chacha20_glue.c | 123 | ||||
-rw-r--r-- | crypto/Kconfig | 15 |
4 files changed, 282 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5a4a089e8b1f..b09e9a4cea3e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile | |||
@@ -20,6 +20,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o | |||
20 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o | 20 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o |
21 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o | 21 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o |
22 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o | 22 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o |
23 | obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o | ||
23 | obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o | 24 | obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o |
24 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o | 25 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o |
25 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o | 26 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o |
@@ -60,6 +61,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o | |||
60 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o | 61 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o |
61 | twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o | 62 | twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o |
62 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o | 63 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o |
64 | chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o | ||
63 | serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o | 65 | serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o |
64 | 66 | ||
65 | ifeq ($(avx_supported),yes) | 67 | ifeq ($(avx_supported),yes) |
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S new file mode 100644 index 000000000000..1b97ad074cef --- /dev/null +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S | |||
@@ -0,0 +1,142 @@ | |||
1 | /* | ||
2 | * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions | ||
3 | * | ||
4 | * Copyright (C) 2015 Martin Willi | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | |||
14 | .data | ||
15 | .align 16 | ||
16 | |||
17 | ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 | ||
18 | ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 | ||
19 | |||
20 | .text | ||
21 | |||
22 | ENTRY(chacha20_block_xor_ssse3) | ||
23 | # %rdi: Input state matrix, s | ||
24 | # %rsi: 1 data block output, o | ||
25 | # %rdx: 1 data block input, i | ||
26 | |||
27 | # This function encrypts one ChaCha20 block by loading the state matrix | ||
28 | # into four SSE registers. It performs matrix operations on four words in | ||
29 | # parallel, but requires shuffling to rearrange the words after each | ||
30 | # round. 8/16-bit word rotation is done with the slightly better | ||
31 | # performing SSSE3 byte shuffling, 7/12-bit word rotation uses | ||
32 | # traditional shift+OR. | ||
33 | |||
34 | # x0..3 = s0..3 | ||
35 | movdqa 0x00(%rdi),%xmm0 | ||
36 | movdqa 0x10(%rdi),%xmm1 | ||
37 | movdqa 0x20(%rdi),%xmm2 | ||
38 | movdqa 0x30(%rdi),%xmm3 | ||
39 | movdqa %xmm0,%xmm8 | ||
40 | movdqa %xmm1,%xmm9 | ||
41 | movdqa %xmm2,%xmm10 | ||
42 | movdqa %xmm3,%xmm11 | ||
43 | |||
44 | movdqa ROT8(%rip),%xmm4 | ||
45 | movdqa ROT16(%rip),%xmm5 | ||
46 | |||
47 | mov $10,%ecx | ||
48 | |||
49 | .Ldoubleround: | ||
50 | |||
51 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) | ||
52 | paddd %xmm1,%xmm0 | ||
53 | pxor %xmm0,%xmm3 | ||
54 | pshufb %xmm5,%xmm3 | ||
55 | |||
56 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) | ||
57 | paddd %xmm3,%xmm2 | ||
58 | pxor %xmm2,%xmm1 | ||
59 | movdqa %xmm1,%xmm6 | ||
60 | pslld $12,%xmm6 | ||
61 | psrld $20,%xmm1 | ||
62 | por %xmm6,%xmm1 | ||
63 | |||
64 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) | ||
65 | paddd %xmm1,%xmm0 | ||
66 | pxor %xmm0,%xmm3 | ||
67 | pshufb %xmm4,%xmm3 | ||
68 | |||
69 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) | ||
70 | paddd %xmm3,%xmm2 | ||
71 | pxor %xmm2,%xmm1 | ||
72 | movdqa %xmm1,%xmm7 | ||
73 | pslld $7,%xmm7 | ||
74 | psrld $25,%xmm1 | ||
75 | por %xmm7,%xmm1 | ||
76 | |||
77 | # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) | ||
78 | pshufd $0x39,%xmm1,%xmm1 | ||
79 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) | ||
80 | pshufd $0x4e,%xmm2,%xmm2 | ||
81 | # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) | ||
82 | pshufd $0x93,%xmm3,%xmm3 | ||
83 | |||
84 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) | ||
85 | paddd %xmm1,%xmm0 | ||
86 | pxor %xmm0,%xmm3 | ||
87 | pshufb %xmm5,%xmm3 | ||
88 | |||
89 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) | ||
90 | paddd %xmm3,%xmm2 | ||
91 | pxor %xmm2,%xmm1 | ||
92 | movdqa %xmm1,%xmm6 | ||
93 | pslld $12,%xmm6 | ||
94 | psrld $20,%xmm1 | ||
95 | por %xmm6,%xmm1 | ||
96 | |||
97 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) | ||
98 | paddd %xmm1,%xmm0 | ||
99 | pxor %xmm0,%xmm3 | ||
100 | pshufb %xmm4,%xmm3 | ||
101 | |||
102 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) | ||
103 | paddd %xmm3,%xmm2 | ||
104 | pxor %xmm2,%xmm1 | ||
105 | movdqa %xmm1,%xmm7 | ||
106 | pslld $7,%xmm7 | ||
107 | psrld $25,%xmm1 | ||
108 | por %xmm7,%xmm1 | ||
109 | |||
110 | # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) | ||
111 | pshufd $0x93,%xmm1,%xmm1 | ||
112 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) | ||
113 | pshufd $0x4e,%xmm2,%xmm2 | ||
114 | # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) | ||
115 | pshufd $0x39,%xmm3,%xmm3 | ||
116 | |||
117 | dec %ecx | ||
118 | jnz .Ldoubleround | ||
119 | |||
120 | # o0 = i0 ^ (x0 + s0) | ||
121 | movdqu 0x00(%rdx),%xmm4 | ||
122 | paddd %xmm8,%xmm0 | ||
123 | pxor %xmm4,%xmm0 | ||
124 | movdqu %xmm0,0x00(%rsi) | ||
125 | # o1 = i1 ^ (x1 + s1) | ||
126 | movdqu 0x10(%rdx),%xmm5 | ||
127 | paddd %xmm9,%xmm1 | ||
128 | pxor %xmm5,%xmm1 | ||
129 | movdqu %xmm1,0x10(%rsi) | ||
130 | # o2 = i2 ^ (x2 + s2) | ||
131 | movdqu 0x20(%rdx),%xmm6 | ||
132 | paddd %xmm10,%xmm2 | ||
133 | pxor %xmm6,%xmm2 | ||
134 | movdqu %xmm2,0x20(%rsi) | ||
135 | # o3 = i3 ^ (x3 + s3) | ||
136 | movdqu 0x30(%rdx),%xmm7 | ||
137 | paddd %xmm11,%xmm3 | ||
138 | pxor %xmm7,%xmm3 | ||
139 | movdqu %xmm3,0x30(%rsi) | ||
140 | |||
141 | ret | ||
142 | ENDPROC(chacha20_block_xor_ssse3) | ||
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c new file mode 100644 index 000000000000..250de401d28f --- /dev/null +++ b/arch/x86/crypto/chacha20_glue.c | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code | ||
3 | * | ||
4 | * Copyright (C) 2015 Martin Willi | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <crypto/algapi.h> | ||
13 | #include <crypto/chacha20.h> | ||
14 | #include <linux/crypto.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <asm/fpu/api.h> | ||
18 | #include <asm/simd.h> | ||
19 | |||
20 | #define CHACHA20_STATE_ALIGN 16 | ||
21 | |||
22 | asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); | ||
23 | |||
24 | static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, | ||
25 | unsigned int bytes) | ||
26 | { | ||
27 | u8 buf[CHACHA20_BLOCK_SIZE]; | ||
28 | |||
29 | while (bytes >= CHACHA20_BLOCK_SIZE) { | ||
30 | chacha20_block_xor_ssse3(state, dst, src); | ||
31 | bytes -= CHACHA20_BLOCK_SIZE; | ||
32 | src += CHACHA20_BLOCK_SIZE; | ||
33 | dst += CHACHA20_BLOCK_SIZE; | ||
34 | state[12]++; | ||
35 | } | ||
36 | if (bytes) { | ||
37 | memcpy(buf, src, bytes); | ||
38 | chacha20_block_xor_ssse3(state, buf, buf); | ||
39 | memcpy(dst, buf, bytes); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
44 | struct scatterlist *src, unsigned int nbytes) | ||
45 | { | ||
46 | u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; | ||
47 | struct blkcipher_walk walk; | ||
48 | int err; | ||
49 | |||
50 | if (!may_use_simd()) | ||
51 | return crypto_chacha20_crypt(desc, dst, src, nbytes); | ||
52 | |||
53 | state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); | ||
54 | |||
55 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
56 | err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); | ||
57 | |||
58 | crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); | ||
59 | |||
60 | kernel_fpu_begin(); | ||
61 | |||
62 | while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { | ||
63 | chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, | ||
64 | rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); | ||
65 | err = blkcipher_walk_done(desc, &walk, | ||
66 | walk.nbytes % CHACHA20_BLOCK_SIZE); | ||
67 | } | ||
68 | |||
69 | if (walk.nbytes) { | ||
70 | chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, | ||
71 | walk.nbytes); | ||
72 | err = blkcipher_walk_done(desc, &walk, 0); | ||
73 | } | ||
74 | |||
75 | kernel_fpu_end(); | ||
76 | |||
77 | return err; | ||
78 | } | ||
79 | |||
80 | static struct crypto_alg alg = { | ||
81 | .cra_name = "chacha20", | ||
82 | .cra_driver_name = "chacha20-simd", | ||
83 | .cra_priority = 300, | ||
84 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
85 | .cra_blocksize = 1, | ||
86 | .cra_type = &crypto_blkcipher_type, | ||
87 | .cra_ctxsize = sizeof(struct chacha20_ctx), | ||
88 | .cra_alignmask = sizeof(u32) - 1, | ||
89 | .cra_module = THIS_MODULE, | ||
90 | .cra_u = { | ||
91 | .blkcipher = { | ||
92 | .min_keysize = CHACHA20_KEY_SIZE, | ||
93 | .max_keysize = CHACHA20_KEY_SIZE, | ||
94 | .ivsize = CHACHA20_IV_SIZE, | ||
95 | .geniv = "seqiv", | ||
96 | .setkey = crypto_chacha20_setkey, | ||
97 | .encrypt = chacha20_simd, | ||
98 | .decrypt = chacha20_simd, | ||
99 | }, | ||
100 | }, | ||
101 | }; | ||
102 | |||
103 | static int __init chacha20_simd_mod_init(void) | ||
104 | { | ||
105 | if (!cpu_has_ssse3) | ||
106 | return -ENODEV; | ||
107 | |||
108 | return crypto_register_alg(&alg); | ||
109 | } | ||
110 | |||
111 | static void __exit chacha20_simd_mod_fini(void) | ||
112 | { | ||
113 | crypto_unregister_alg(&alg); | ||
114 | } | ||
115 | |||
116 | module_init(chacha20_simd_mod_init); | ||
117 | module_exit(chacha20_simd_mod_fini); | ||
118 | |||
119 | MODULE_LICENSE("GPL"); | ||
120 | MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); | ||
121 | MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); | ||
122 | MODULE_ALIAS_CRYPTO("chacha20"); | ||
123 | MODULE_ALIAS_CRYPTO("chacha20-simd"); | ||
diff --git a/crypto/Kconfig b/crypto/Kconfig index b4cfc5754033..8f24185ee0a7 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig | |||
@@ -1213,6 +1213,21 @@ config CRYPTO_CHACHA20 | |||
1213 | See also: | 1213 | See also: |
1214 | <http://cr.yp.to/chacha/chacha-20080128.pdf> | 1214 | <http://cr.yp.to/chacha/chacha-20080128.pdf> |
1215 | 1215 | ||
1216 | config CRYPTO_CHACHA20_X86_64 | ||
1217 | tristate "ChaCha20 cipher algorithm (x86_64/SSSE3)" | ||
1218 | depends on X86 && 64BIT | ||
1219 | select CRYPTO_BLKCIPHER | ||
1220 | select CRYPTO_CHACHA20 | ||
1221 | help | ||
1222 | ChaCha20 cipher algorithm, RFC7539. | ||
1223 | |||
1224 | ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J. | ||
1225 | Bernstein and further specified in RFC7539 for use in IETF protocols. | ||
1226 | This is the x86_64 assembler implementation using SIMD instructions. | ||
1227 | |||
1228 | See also: | ||
1229 | <http://cr.yp.to/chacha/chacha-20080128.pdf> | ||
1230 | |||
1216 | config CRYPTO_SEED | 1231 | config CRYPTO_SEED |
1217 | tristate "SEED cipher algorithm" | 1232 | tristate "SEED cipher algorithm" |
1218 | select CRYPTO_ALGAPI | 1233 | select CRYPTO_ALGAPI |