author		Jussi Kivilinna <jussi.kivilinna@mbnet.fi>	2012-10-20 08:06:41 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2012-10-24 09:10:54 -0400
commit		cba1cce05498d55f363c28cd2512368e95605518 (patch)
tree		a8728af2175a89a598b865c89dc2e3f08313c813 /arch/x86/crypto/cast6_avx_glue.c
parent		58990986f1cba40c23c0c10592ace08616de3ffa (diff)
crypto: cast6/avx - avoid using temporary stack buffers
Introduce new assembler functions to avoid using temporary stack buffers in
the glue code. This also allows using vector instructions for XORing the
output in CTR and CBC modes and for constructing the IVs in CTR mode.

ECB mode sees a ~0.5% decrease in speed because of the added extra function
call. CBC mode decryption and CTR mode benefit from the vector operations and
gain ~2%.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
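The temporary buffers exist because the glue code works in place (dst may
equal src). In CBC decryption each plaintext block is P[i] = D(C[i]) ^ C[i-1],
so the ciphertext blocks still needed for the chaining XOR must be saved
before the 8-way decrypt overwrites them. The sketch below is a commented
copy of the removed cast6_decrypt_cbc_xway() from the diff, renamed here for
illustration; the new cast6_cbc_dec_8way() assembler routine keeps the same
values in vector registers instead of on the stack.

/*
 * Commented copy of the removed cast6_decrypt_cbc_xway() glue helper
 * (renamed for illustration).  The ivs[] array is the temporary stack
 * buffer this patch eliminates.
 */
static void cbc_dec_xway_model(void *ctx, u128 *dst, const u128 *src)
{
	u128 ivs[CAST6_PARALLEL_BLOCKS - 1];	/* saved ciphertext blocks */
	unsigned int j;

	/* dst may alias src, so save C[0..6] before decrypting in place */
	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
		ivs[j] = src[j];

	cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);

	/*
	 * CBC chaining: P[j+1] ^= C[j].  Block 0 is XORed with the
	 * previous IV by the common glue layer.
	 */
	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}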
Diffstat (limited to 'arch/x86/crypto/cast6_avx_glue.c')
-rw-r--r--	arch/x86/crypto/cast6_avx_glue.c	71
1 file changed, 13 insertions(+), 58 deletions(-)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 1dfd33b5b4fb..92f7ca24790a 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -40,43 +40,15 @@
 
 #define CAST6_PARALLEL_BLOCKS 8
 
-asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
-				     const u8 *src, bool xor);
-asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
+asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
 				   const u8 *src);
 
-static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__cast6_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__cast6_enc_blk_8way(ctx, dst, src, true);
-}
-
-static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	cast6_dec_blk_8way(ctx, dst, src);
-}
-
-
-static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
+asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src);
+asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
+			       le128 *iv);
 
 static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
@@ -89,30 +61,13 @@ static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
-static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				 le128 *iv)
-{
-	be128 ctrblks[CAST6_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		le128_to_be128(&ctrblks[i], iv);
-		le128_inc(iv);
-	}
-
-	cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-
 static const struct common_glue_ctx cast6_enc = {
 	.num_funcs = 2,
 	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
-		cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
-		cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
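For CTR mode the story is the same. The removed cast6_crypt_ctr_xway()
(second hunk above) built the eight big-endian counter blocks in a ctrblks[]
stack array and then called the XORing variant of the encrypt routine; the
commented copy below, renamed for illustration, shows the work that
cast6_ctr_8way() now performs with byte-swap and XOR vector instructions
instead of stack traffic.

/*
 * Commented copy of the removed cast6_crypt_ctr_xway() glue helper
 * (renamed for illustration).  ctrblks[] is the temporary stack buffer;
 * constructing and XORing these values is what the new cast6_ctr_8way()
 * assembler routine does in vector registers.
 */
static void ctr_xway_model(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
	be128 ctrblks[CAST6_PARALLEL_BLOCKS];	/* counter blocks */
	unsigned int i;

	for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
		if (dst != src)		/* make dst hold the input blocks */
			dst[i] = src[i];

		/* counter is kept little-endian, encrypted big-endian */
		le128_to_be128(&ctrblks[i], iv);
		le128_inc(iv);
	}

	/* encrypt the counters and XOR the keystream into dst */
	cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}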