author     Jonathan Herman <hermanjl@cs.unc.edu>    2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>    2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /arch/x86/crypto
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--  arch/x86/crypto/Makefile                          29
-rw-r--r--  arch/x86/crypto/ablk_helper.c                    149
-rw-r--r--  arch/x86/crypto/aes_glue.c                         3
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S                  6
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c              1091
-rw-r--r--  arch/x86/crypto/blowfish-x86_64-asm_64.S         390
-rw-r--r--  arch/x86/crypto/blowfish_glue.c                  485
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx-asm_64.S     1102
-rw-r--r--  arch/x86/crypto/camellia-x86_64-asm_64.S         520
-rw-r--r--  arch/x86/crypto/camellia_aesni_avx_glue.c        558
-rw-r--r--  arch/x86/crypto/camellia_glue.c                 1729
-rw-r--r--  arch/x86/crypto/cast5-avx-x86_64-asm_64.S        558
-rw-r--r--  arch/x86/crypto/cast5_avx_glue.c                 497
-rw-r--r--  arch/x86/crypto/cast6-avx-x86_64-asm_64.S        439
-rw-r--r--  arch/x86/crypto/cast6_avx_glue.c                 603
-rw-r--r--  arch/x86/crypto/crc32c-intel_glue.c              284
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S        460
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c        14
-rw-r--r--  arch/x86/crypto/glue_helper-asm-avx.S             91
-rw-r--r--  arch/x86/crypto/glue_helper.c                    307
-rw-r--r--  arch/x86/crypto/salsa20_glue.c                     1
-rw-r--r--  arch/x86/crypto/serpent-avx-x86_64-asm_64.S      754
-rw-r--r--  arch/x86/crypto/serpent-sse2-i586-asm_32.S       635
-rw-r--r--  arch/x86/crypto/serpent-sse2-x86_64-asm_64.S     758
-rw-r--r--  arch/x86/crypto/serpent_avx_glue.c               595
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c              621
-rw-r--r--  arch/x86/crypto/sha1_ssse3_asm.S                 558
-rw-r--r--  arch/x86/crypto/sha1_ssse3_glue.c                240
-rw-r--r--  arch/x86/crypto/twofish-avx-x86_64-asm_64.S      423
-rw-r--r--  arch/x86/crypto/twofish-i586-asm_32.S             10
-rw-r--r--  arch/x86/crypto/twofish-x86_64-asm_64-3way.S     316
-rw-r--r--  arch/x86/crypto/twofish-x86_64-asm_64.S            6
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c               571
-rw-r--r--  arch/x86/crypto/twofish_glue.c                    15
-rw-r--r--  arch/x86/crypto/twofish_glue_3way.c              499
35 files changed, 596 insertions, 14721 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0ca7c9ac38..c04f1b7a913 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,53 +2,26 @@
 # Arch-specific CryptoAPI modules.
 #
 
-obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
-obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
-
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
-obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
-obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
-obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
-obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
-obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
-obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
-obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
 salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
-serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
-camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
-camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
-					camellia_aesni_avx_glue.o
-cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
-cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
-blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
-twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
-serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
-serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
+
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
-crc32c-intel-y := crc32c-intel_glue.o
-crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c
deleted file mode 100644
index 43282fe04a8..00000000000
--- a/arch/x86/crypto/ablk_helper.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Shared async block cipher helpers
- *
- * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * Based on aesni-intel_glue.c by:
- *  Copyright (C) 2008, Intel Corp.
- *    Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
- * USA
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <crypto/algapi.h>
-#include <crypto/cryptd.h>
-#include <asm/i387.h>
-#include <asm/crypto/ablk_helper.h>
-
-int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
-		 unsigned int key_len)
-{
-	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
-	int err;
-
-	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
-	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
-				    & CRYPTO_TFM_REQ_MASK);
-	err = crypto_ablkcipher_setkey(child, key, key_len);
-	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
-				    & CRYPTO_TFM_RES_MASK);
-	return err;
-}
-EXPORT_SYMBOL_GPL(ablk_set_key);
-
-int __ablk_encrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-	struct blkcipher_desc desc;
-
-	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-	desc.info = req->info;
-	desc.flags = 0;
-
-	return crypto_blkcipher_crt(desc.tfm)->encrypt(
-		&desc, req->dst, req->src, req->nbytes);
-}
-EXPORT_SYMBOL_GPL(__ablk_encrypt);
-
-int ablk_encrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
-		return crypto_ablkcipher_encrypt(cryptd_req);
-	} else {
-		return __ablk_encrypt(req);
-	}
-}
-EXPORT_SYMBOL_GPL(ablk_encrypt);
-
-int ablk_decrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
-		return crypto_ablkcipher_decrypt(cryptd_req);
-	} else {
-		struct blkcipher_desc desc;
-
-		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-		desc.info = req->info;
-		desc.flags = 0;
-
-		return crypto_blkcipher_crt(desc.tfm)->decrypt(
-			&desc, req->dst, req->src, req->nbytes);
-	}
-}
-EXPORT_SYMBOL_GPL(ablk_decrypt);
-
-void ablk_exit(struct crypto_tfm *tfm)
-{
-	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	cryptd_free_ablkcipher(ctx->cryptd_tfm);
-}
-EXPORT_SYMBOL_GPL(ablk_exit);
-
-int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
-{
-	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-
-	ctx->cryptd_tfm = cryptd_tfm;
-	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
-		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ablk_init_common);
-
-int ablk_init(struct crypto_tfm *tfm)
-{
-	char drv_name[CRYPTO_MAX_ALG_NAME];
-
-	snprintf(drv_name, sizeof(drv_name), "__driver-%s",
-		 crypto_tfm_alg_driver_name(tfm));
-
-	return ablk_init_common(tfm, drv_name);
-}
-EXPORT_SYMBOL_GPL(ablk_init);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index aafe8ce0d65..49ae9fe32b2 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -3,9 +3,7 @@
  *
  */
 
-#include <linux/module.h>
 #include <crypto/aes.h>
-#include <asm/crypto/aes.h>
 
 asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
 asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
@@ -40,6 +38,7 @@ static struct crypto_alg aes_alg = {
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
 	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(aes_alg.cra_list),
 	.cra_u	= {
 		.cipher	= {
 			.cia_min_keysize	= AES_MIN_KEY_SIZE,
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 3470624d783..be6d9e365a8 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2460,12 +2460,10 @@ ENTRY(aesni_cbc_dec)
 	pxor IN3, STATE4
 	movaps IN4, IV
 #else
+	pxor (INP), STATE2
+	pxor 0x10(INP), STATE3
 	pxor IN1, STATE4
 	movaps IN2, IV
-	movups (INP), IN1
-	pxor IN1, STATE2
-	movups 0x10(INP), IN2
-	pxor IN2, STATE3
 #endif
 	movups STATE1, (OUTP)
 	movups STATE2, 0x10(OUTP)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 1b9c22bea8a..feee8ff1d05 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -22,19 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/crypto.h> 24#include <linux/crypto.h>
25#include <linux/module.h>
26#include <linux/err.h> 25#include <linux/err.h>
27#include <crypto/algapi.h> 26#include <crypto/algapi.h>
28#include <crypto/aes.h> 27#include <crypto/aes.h>
29#include <crypto/cryptd.h> 28#include <crypto/cryptd.h>
30#include <crypto/ctr.h> 29#include <crypto/ctr.h>
31#include <crypto/b128ops.h>
32#include <crypto/lrw.h>
33#include <crypto/xts.h>
34#include <asm/cpu_device_id.h>
35#include <asm/i387.h> 30#include <asm/i387.h>
36#include <asm/crypto/aes.h> 31#include <asm/aes.h>
37#include <asm/crypto/ablk_helper.h>
38#include <crypto/scatterwalk.h> 32#include <crypto/scatterwalk.h>
39#include <crypto/internal/aead.h> 33#include <crypto/internal/aead.h>
40#include <linux/workqueue.h> 34#include <linux/workqueue.h>
@@ -44,10 +38,22 @@
44#define HAS_CTR 38#define HAS_CTR
45#endif 39#endif
46 40
41#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
42#define HAS_LRW
43#endif
44
47#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) 45#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
48#define HAS_PCBC 46#define HAS_PCBC
49#endif 47#endif
50 48
49#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
50#define HAS_XTS
51#endif
52
53struct async_aes_ctx {
54 struct cryptd_ablkcipher *cryptd_tfm;
55};
56
51/* This data is stored at the end of the crypto_tfm struct. 57/* This data is stored at the end of the crypto_tfm struct.
52 * It's a type of per "session" data storage location. 58 * It's a type of per "session" data storage location.
53 * This needs to be 16 byte aligned. 59 * This needs to be 16 byte aligned.
@@ -74,16 +80,6 @@ struct aesni_hash_subkey_req_data {
74#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) 80#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
75#define RFC4106_HASH_SUBKEY_SIZE 16 81#define RFC4106_HASH_SUBKEY_SIZE 16
76 82
77struct aesni_lrw_ctx {
78 struct lrw_table_ctx lrw_table;
79 u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
80};
81
82struct aesni_xts_ctx {
83 u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
84 u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
85};
86
87asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 83asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
88 unsigned int key_len); 84 unsigned int key_len);
89asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out, 85asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
@@ -224,6 +220,27 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
224 } 220 }
225} 221}
226 222
223static struct crypto_alg aesni_alg = {
224 .cra_name = "aes",
225 .cra_driver_name = "aes-aesni",
226 .cra_priority = 300,
227 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
228 .cra_blocksize = AES_BLOCK_SIZE,
229 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
230 .cra_alignmask = 0,
231 .cra_module = THIS_MODULE,
232 .cra_list = LIST_HEAD_INIT(aesni_alg.cra_list),
233 .cra_u = {
234 .cipher = {
235 .cia_min_keysize = AES_MIN_KEY_SIZE,
236 .cia_max_keysize = AES_MAX_KEY_SIZE,
237 .cia_setkey = aes_set_key,
238 .cia_encrypt = aes_encrypt,
239 .cia_decrypt = aes_decrypt
240 }
241 }
242};
243
227static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) 244static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
228{ 245{
229 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); 246 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
@@ -238,6 +255,27 @@ static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
238 aesni_dec(ctx, dst, src); 255 aesni_dec(ctx, dst, src);
239} 256}
240 257
258static struct crypto_alg __aesni_alg = {
259 .cra_name = "__aes-aesni",
260 .cra_driver_name = "__driver-aes-aesni",
261 .cra_priority = 0,
262 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
263 .cra_blocksize = AES_BLOCK_SIZE,
264 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
265 .cra_alignmask = 0,
266 .cra_module = THIS_MODULE,
267 .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list),
268 .cra_u = {
269 .cipher = {
270 .cia_min_keysize = AES_MIN_KEY_SIZE,
271 .cia_max_keysize = AES_MAX_KEY_SIZE,
272 .cia_setkey = aes_set_key,
273 .cia_encrypt = __aes_encrypt,
274 .cia_decrypt = __aes_decrypt
275 }
276 }
277};
278
241static int ecb_encrypt(struct blkcipher_desc *desc, 279static int ecb_encrypt(struct blkcipher_desc *desc,
242 struct scatterlist *dst, struct scatterlist *src, 280 struct scatterlist *dst, struct scatterlist *src,
243 unsigned int nbytes) 281 unsigned int nbytes)
@@ -286,6 +324,28 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
286 return err; 324 return err;
287} 325}
288 326
327static struct crypto_alg blk_ecb_alg = {
328 .cra_name = "__ecb-aes-aesni",
329 .cra_driver_name = "__driver-ecb-aes-aesni",
330 .cra_priority = 0,
331 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
332 .cra_blocksize = AES_BLOCK_SIZE,
333 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
334 .cra_alignmask = 0,
335 .cra_type = &crypto_blkcipher_type,
336 .cra_module = THIS_MODULE,
337 .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
338 .cra_u = {
339 .blkcipher = {
340 .min_keysize = AES_MIN_KEY_SIZE,
341 .max_keysize = AES_MAX_KEY_SIZE,
342 .setkey = aes_set_key,
343 .encrypt = ecb_encrypt,
344 .decrypt = ecb_decrypt,
345 },
346 },
347};
348
289static int cbc_encrypt(struct blkcipher_desc *desc, 349static int cbc_encrypt(struct blkcipher_desc *desc,
290 struct scatterlist *dst, struct scatterlist *src, 350 struct scatterlist *dst, struct scatterlist *src,
291 unsigned int nbytes) 351 unsigned int nbytes)
@@ -334,6 +394,28 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
334 return err; 394 return err;
335} 395}
336 396
397static struct crypto_alg blk_cbc_alg = {
398 .cra_name = "__cbc-aes-aesni",
399 .cra_driver_name = "__driver-cbc-aes-aesni",
400 .cra_priority = 0,
401 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
402 .cra_blocksize = AES_BLOCK_SIZE,
403 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
404 .cra_alignmask = 0,
405 .cra_type = &crypto_blkcipher_type,
406 .cra_module = THIS_MODULE,
407 .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
408 .cra_u = {
409 .blkcipher = {
410 .min_keysize = AES_MIN_KEY_SIZE,
411 .max_keysize = AES_MAX_KEY_SIZE,
412 .setkey = aes_set_key,
413 .encrypt = cbc_encrypt,
414 .decrypt = cbc_decrypt,
415 },
416 },
417};
418
337#ifdef CONFIG_X86_64 419#ifdef CONFIG_X86_64
338static void ctr_crypt_final(struct crypto_aes_ctx *ctx, 420static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
339 struct blkcipher_walk *walk) 421 struct blkcipher_walk *walk)
@@ -377,199 +459,373 @@ static int ctr_crypt(struct blkcipher_desc *desc,
377 459
378 return err; 460 return err;
379} 461}
462
463static struct crypto_alg blk_ctr_alg = {
464 .cra_name = "__ctr-aes-aesni",
465 .cra_driver_name = "__driver-ctr-aes-aesni",
466 .cra_priority = 0,
467 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
468 .cra_blocksize = 1,
469 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
470 .cra_alignmask = 0,
471 .cra_type = &crypto_blkcipher_type,
472 .cra_module = THIS_MODULE,
473 .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
474 .cra_u = {
475 .blkcipher = {
476 .min_keysize = AES_MIN_KEY_SIZE,
477 .max_keysize = AES_MAX_KEY_SIZE,
478 .ivsize = AES_BLOCK_SIZE,
479 .setkey = aes_set_key,
480 .encrypt = ctr_crypt,
481 .decrypt = ctr_crypt,
482 },
483 },
484};
380#endif 485#endif
381 486
382static int ablk_ecb_init(struct crypto_tfm *tfm) 487static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
488 unsigned int key_len)
383{ 489{
384 return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); 490 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
385} 491 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
492 int err;
386 493
387static int ablk_cbc_init(struct crypto_tfm *tfm) 494 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
388{ 495 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
389 return ablk_init_common(tfm, "__driver-cbc-aes-aesni"); 496 & CRYPTO_TFM_REQ_MASK);
497 err = crypto_ablkcipher_setkey(child, key, key_len);
498 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
499 & CRYPTO_TFM_RES_MASK);
500 return err;
390} 501}
391 502
392#ifdef CONFIG_X86_64 503static int ablk_encrypt(struct ablkcipher_request *req)
393static int ablk_ctr_init(struct crypto_tfm *tfm)
394{ 504{
395 return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); 505 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
396} 506 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
397 507
398#ifdef HAS_CTR 508 if (!irq_fpu_usable()) {
399static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) 509 struct ablkcipher_request *cryptd_req =
400{ 510 ablkcipher_request_ctx(req);
401 return ablk_init_common(tfm, "rfc3686(__driver-ctr-aes-aesni)"); 511 memcpy(cryptd_req, req, sizeof(*req));
512 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
513 return crypto_ablkcipher_encrypt(cryptd_req);
514 } else {
515 struct blkcipher_desc desc;
516 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
517 desc.info = req->info;
518 desc.flags = 0;
519 return crypto_blkcipher_crt(desc.tfm)->encrypt(
520 &desc, req->dst, req->src, req->nbytes);
521 }
402} 522}
403#endif
404#endif
405 523
406#ifdef HAS_PCBC 524static int ablk_decrypt(struct ablkcipher_request *req)
407static int ablk_pcbc_init(struct crypto_tfm *tfm)
408{ 525{
409 return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); 526 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
410} 527 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
411#endif
412 528
413static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) 529 if (!irq_fpu_usable()) {
414{ 530 struct ablkcipher_request *cryptd_req =
415 aesni_ecb_enc(ctx, blks, blks, nbytes); 531 ablkcipher_request_ctx(req);
532 memcpy(cryptd_req, req, sizeof(*req));
533 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
534 return crypto_ablkcipher_decrypt(cryptd_req);
535 } else {
536 struct blkcipher_desc desc;
537 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
538 desc.info = req->info;
539 desc.flags = 0;
540 return crypto_blkcipher_crt(desc.tfm)->decrypt(
541 &desc, req->dst, req->src, req->nbytes);
542 }
416} 543}
417 544
418static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) 545static void ablk_exit(struct crypto_tfm *tfm)
419{ 546{
420 aesni_ecb_dec(ctx, blks, blks, nbytes); 547 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
548
549 cryptd_free_ablkcipher(ctx->cryptd_tfm);
421} 550}
422 551
423static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, 552static void ablk_init_common(struct crypto_tfm *tfm,
424 unsigned int keylen) 553 struct cryptd_ablkcipher *cryptd_tfm)
425{ 554{
426 struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 555 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
427 int err;
428 556
429 err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key, 557 ctx->cryptd_tfm = cryptd_tfm;
430 keylen - AES_BLOCK_SIZE); 558 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
431 if (err) 559 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
432 return err;
433
434 return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE);
435} 560}
436 561
437static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm) 562static int ablk_ecb_init(struct crypto_tfm *tfm)
438{ 563{
439 struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 564 struct cryptd_ablkcipher *cryptd_tfm;
440 565
441 lrw_free_table(&ctx->lrw_table); 566 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-aes-aesni", 0, 0);
567 if (IS_ERR(cryptd_tfm))
568 return PTR_ERR(cryptd_tfm);
569 ablk_init_common(tfm, cryptd_tfm);
570 return 0;
442} 571}
443 572
444static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 573static struct crypto_alg ablk_ecb_alg = {
445 struct scatterlist *src, unsigned int nbytes) 574 .cra_name = "ecb(aes)",
446{ 575 .cra_driver_name = "ecb-aes-aesni",
447 struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 576 .cra_priority = 400,
448 be128 buf[8]; 577 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
449 struct lrw_crypt_req req = { 578 .cra_blocksize = AES_BLOCK_SIZE,
450 .tbuf = buf, 579 .cra_ctxsize = sizeof(struct async_aes_ctx),
451 .tbuflen = sizeof(buf), 580 .cra_alignmask = 0,
452 581 .cra_type = &crypto_ablkcipher_type,
453 .table_ctx = &ctx->lrw_table, 582 .cra_module = THIS_MODULE,
454 .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), 583 .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
455 .crypt_fn = lrw_xts_encrypt_callback, 584 .cra_init = ablk_ecb_init,
456 }; 585 .cra_exit = ablk_exit,
457 int ret; 586 .cra_u = {
458 587 .ablkcipher = {
459 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 588 .min_keysize = AES_MIN_KEY_SIZE,
589 .max_keysize = AES_MAX_KEY_SIZE,
590 .setkey = ablk_set_key,
591 .encrypt = ablk_encrypt,
592 .decrypt = ablk_decrypt,
593 },
594 },
595};
460 596
461 kernel_fpu_begin(); 597static int ablk_cbc_init(struct crypto_tfm *tfm)
462 ret = lrw_crypt(desc, dst, src, nbytes, &req); 598{
463 kernel_fpu_end(); 599 struct cryptd_ablkcipher *cryptd_tfm;
464 600
465 return ret; 601 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-aes-aesni", 0, 0);
602 if (IS_ERR(cryptd_tfm))
603 return PTR_ERR(cryptd_tfm);
604 ablk_init_common(tfm, cryptd_tfm);
605 return 0;
466} 606}
467 607
468static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 608static struct crypto_alg ablk_cbc_alg = {
469 struct scatterlist *src, unsigned int nbytes) 609 .cra_name = "cbc(aes)",
470{ 610 .cra_driver_name = "cbc-aes-aesni",
471 struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 611 .cra_priority = 400,
472 be128 buf[8]; 612 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
473 struct lrw_crypt_req req = { 613 .cra_blocksize = AES_BLOCK_SIZE,
474 .tbuf = buf, 614 .cra_ctxsize = sizeof(struct async_aes_ctx),
475 .tbuflen = sizeof(buf), 615 .cra_alignmask = 0,
476 616 .cra_type = &crypto_ablkcipher_type,
477 .table_ctx = &ctx->lrw_table, 617 .cra_module = THIS_MODULE,
478 .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), 618 .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
479 .crypt_fn = lrw_xts_decrypt_callback, 619 .cra_init = ablk_cbc_init,
480 }; 620 .cra_exit = ablk_exit,
481 int ret; 621 .cra_u = {
482 622 .ablkcipher = {
483 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 623 .min_keysize = AES_MIN_KEY_SIZE,
624 .max_keysize = AES_MAX_KEY_SIZE,
625 .ivsize = AES_BLOCK_SIZE,
626 .setkey = ablk_set_key,
627 .encrypt = ablk_encrypt,
628 .decrypt = ablk_decrypt,
629 },
630 },
631};
484 632
485 kernel_fpu_begin(); 633#ifdef CONFIG_X86_64
486 ret = lrw_crypt(desc, dst, src, nbytes, &req); 634static int ablk_ctr_init(struct crypto_tfm *tfm)
487 kernel_fpu_end(); 635{
636 struct cryptd_ablkcipher *cryptd_tfm;
488 637
489 return ret; 638 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0);
639 if (IS_ERR(cryptd_tfm))
640 return PTR_ERR(cryptd_tfm);
641 ablk_init_common(tfm, cryptd_tfm);
642 return 0;
490} 643}
491 644
492static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, 645static struct crypto_alg ablk_ctr_alg = {
493 unsigned int keylen) 646 .cra_name = "ctr(aes)",
494{ 647 .cra_driver_name = "ctr-aes-aesni",
495 struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); 648 .cra_priority = 400,
496 u32 *flags = &tfm->crt_flags; 649 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
497 int err; 650 .cra_blocksize = 1,
498 651 .cra_ctxsize = sizeof(struct async_aes_ctx),
499 /* key consists of keys of equal size concatenated, therefore 652 .cra_alignmask = 0,
500 * the length must be even 653 .cra_type = &crypto_ablkcipher_type,
501 */ 654 .cra_module = THIS_MODULE,
502 if (keylen % 2) { 655 .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
503 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; 656 .cra_init = ablk_ctr_init,
504 return -EINVAL; 657 .cra_exit = ablk_exit,
505 } 658 .cra_u = {
659 .ablkcipher = {
660 .min_keysize = AES_MIN_KEY_SIZE,
661 .max_keysize = AES_MAX_KEY_SIZE,
662 .ivsize = AES_BLOCK_SIZE,
663 .setkey = ablk_set_key,
664 .encrypt = ablk_encrypt,
665 .decrypt = ablk_encrypt,
666 .geniv = "chainiv",
667 },
668 },
669};
506 670
507 /* first half of xts-key is for crypt */ 671#ifdef HAS_CTR
508 err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); 672static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
509 if (err) 673{
510 return err; 674 struct cryptd_ablkcipher *cryptd_tfm;
511 675
512 /* second half of xts-key is for tweak */ 676 cryptd_tfm = cryptd_alloc_ablkcipher(
513 return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2, 677 "rfc3686(__driver-ctr-aes-aesni)", 0, 0);
514 keylen / 2); 678 if (IS_ERR(cryptd_tfm))
679 return PTR_ERR(cryptd_tfm);
680 ablk_init_common(tfm, cryptd_tfm);
681 return 0;
515} 682}
516 683
684static struct crypto_alg ablk_rfc3686_ctr_alg = {
685 .cra_name = "rfc3686(ctr(aes))",
686 .cra_driver_name = "rfc3686-ctr-aes-aesni",
687 .cra_priority = 400,
688 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
689 .cra_blocksize = 1,
690 .cra_ctxsize = sizeof(struct async_aes_ctx),
691 .cra_alignmask = 0,
692 .cra_type = &crypto_ablkcipher_type,
693 .cra_module = THIS_MODULE,
694 .cra_list = LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list),
695 .cra_init = ablk_rfc3686_ctr_init,
696 .cra_exit = ablk_exit,
697 .cra_u = {
698 .ablkcipher = {
699 .min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
700 .max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
701 .ivsize = CTR_RFC3686_IV_SIZE,
702 .setkey = ablk_set_key,
703 .encrypt = ablk_encrypt,
704 .decrypt = ablk_decrypt,
705 .geniv = "seqiv",
706 },
707 },
708};
709#endif
710#endif
517 711
518static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) 712#ifdef HAS_LRW
713static int ablk_lrw_init(struct crypto_tfm *tfm)
519{ 714{
520 aesni_enc(ctx, out, in); 715 struct cryptd_ablkcipher *cryptd_tfm;
521}
522 716
523static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 717 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
524 struct scatterlist *src, unsigned int nbytes) 718 0, 0);
525{ 719 if (IS_ERR(cryptd_tfm))
526 struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 720 return PTR_ERR(cryptd_tfm);
527 be128 buf[8]; 721 ablk_init_common(tfm, cryptd_tfm);
528 struct xts_crypt_req req = { 722 return 0;
529 .tbuf = buf, 723}
530 .tbuflen = sizeof(buf),
531
532 .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx),
533 .tweak_fn = aesni_xts_tweak,
534 .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx),
535 .crypt_fn = lrw_xts_encrypt_callback,
536 };
537 int ret;
538 724
539 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 725static struct crypto_alg ablk_lrw_alg = {
726 .cra_name = "lrw(aes)",
727 .cra_driver_name = "lrw-aes-aesni",
728 .cra_priority = 400,
729 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
730 .cra_blocksize = AES_BLOCK_SIZE,
731 .cra_ctxsize = sizeof(struct async_aes_ctx),
732 .cra_alignmask = 0,
733 .cra_type = &crypto_ablkcipher_type,
734 .cra_module = THIS_MODULE,
735 .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
736 .cra_init = ablk_lrw_init,
737 .cra_exit = ablk_exit,
738 .cra_u = {
739 .ablkcipher = {
740 .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
741 .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
742 .ivsize = AES_BLOCK_SIZE,
743 .setkey = ablk_set_key,
744 .encrypt = ablk_encrypt,
745 .decrypt = ablk_decrypt,
746 },
747 },
748};
749#endif
540 750
541 kernel_fpu_begin(); 751#ifdef HAS_PCBC
542 ret = xts_crypt(desc, dst, src, nbytes, &req); 752static int ablk_pcbc_init(struct crypto_tfm *tfm)
543 kernel_fpu_end(); 753{
754 struct cryptd_ablkcipher *cryptd_tfm;
544 755
545 return ret; 756 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
757 0, 0);
758 if (IS_ERR(cryptd_tfm))
759 return PTR_ERR(cryptd_tfm);
760 ablk_init_common(tfm, cryptd_tfm);
761 return 0;
546} 762}
547 763
548static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 764static struct crypto_alg ablk_pcbc_alg = {
549 struct scatterlist *src, unsigned int nbytes) 765 .cra_name = "pcbc(aes)",
550{ 766 .cra_driver_name = "pcbc-aes-aesni",
551 struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 767 .cra_priority = 400,
552 be128 buf[8]; 768 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
553 struct xts_crypt_req req = { 769 .cra_blocksize = AES_BLOCK_SIZE,
554 .tbuf = buf, 770 .cra_ctxsize = sizeof(struct async_aes_ctx),
555 .tbuflen = sizeof(buf), 771 .cra_alignmask = 0,
556 772 .cra_type = &crypto_ablkcipher_type,
557 .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), 773 .cra_module = THIS_MODULE,
558 .tweak_fn = aesni_xts_tweak, 774 .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
559 .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), 775 .cra_init = ablk_pcbc_init,
560 .crypt_fn = lrw_xts_decrypt_callback, 776 .cra_exit = ablk_exit,
561 }; 777 .cra_u = {
562 int ret; 778 .ablkcipher = {
563 779 .min_keysize = AES_MIN_KEY_SIZE,
564 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 780 .max_keysize = AES_MAX_KEY_SIZE,
781 .ivsize = AES_BLOCK_SIZE,
782 .setkey = ablk_set_key,
783 .encrypt = ablk_encrypt,
784 .decrypt = ablk_decrypt,
785 },
786 },
787};
788#endif
565 789
566 kernel_fpu_begin(); 790#ifdef HAS_XTS
567 ret = xts_crypt(desc, dst, src, nbytes, &req); 791static int ablk_xts_init(struct crypto_tfm *tfm)
568 kernel_fpu_end(); 792{
793 struct cryptd_ablkcipher *cryptd_tfm;
569 794
570 return ret; 795 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
796 0, 0);
797 if (IS_ERR(cryptd_tfm))
798 return PTR_ERR(cryptd_tfm);
799 ablk_init_common(tfm, cryptd_tfm);
800 return 0;
571} 801}
572 802
803static struct crypto_alg ablk_xts_alg = {
804 .cra_name = "xts(aes)",
805 .cra_driver_name = "xts-aes-aesni",
806 .cra_priority = 400,
807 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
808 .cra_blocksize = AES_BLOCK_SIZE,
809 .cra_ctxsize = sizeof(struct async_aes_ctx),
810 .cra_alignmask = 0,
811 .cra_type = &crypto_ablkcipher_type,
812 .cra_module = THIS_MODULE,
813 .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
814 .cra_init = ablk_xts_init,
815 .cra_exit = ablk_exit,
816 .cra_u = {
817 .ablkcipher = {
818 .min_keysize = 2 * AES_MIN_KEY_SIZE,
819 .max_keysize = 2 * AES_MAX_KEY_SIZE,
820 .ivsize = AES_BLOCK_SIZE,
821 .setkey = ablk_set_key,
822 .encrypt = ablk_encrypt,
823 .decrypt = ablk_decrypt,
824 },
825 },
826};
827#endif
828
573#ifdef CONFIG_X86_64 829#ifdef CONFIG_X86_64
574static int rfc4106_init(struct crypto_tfm *tfm) 830static int rfc4106_init(struct crypto_tfm *tfm)
575{ 831{
@@ -680,7 +936,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
680 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); 936 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
681 struct aesni_rfc4106_gcm_ctx *child_ctx = 937 struct aesni_rfc4106_gcm_ctx *child_ctx =
682 aesni_rfc4106_gcm_ctx_get(cryptd_child); 938 aesni_rfc4106_gcm_ctx_get(cryptd_child);
683 u8 *new_key_align, *new_key_mem = NULL; 939 u8 *new_key_mem = NULL;
684 940
685 if (key_len < 4) { 941 if (key_len < 4) {
686 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 942 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
@@ -704,9 +960,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
704 if (!new_key_mem) 960 if (!new_key_mem)
705 return -ENOMEM; 961 return -ENOMEM;
706 962
707 new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); 963 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
708 memcpy(new_key_align, key, key_len); 964 memcpy(new_key_mem, key, key_len);
709 key = new_key_align; 965 key = new_key_mem;
710 } 966 }
711 967
712 if (!irq_fpu_usable()) 968 if (!irq_fpu_usable())
@@ -792,6 +1048,32 @@ static int rfc4106_decrypt(struct aead_request *req)
792 } 1048 }
793} 1049}
794 1050
1051static struct crypto_alg rfc4106_alg = {
1052 .cra_name = "rfc4106(gcm(aes))",
1053 .cra_driver_name = "rfc4106-gcm-aesni",
1054 .cra_priority = 400,
1055 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1056 .cra_blocksize = 1,
1057 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1058 .cra_alignmask = 0,
1059 .cra_type = &crypto_nivaead_type,
1060 .cra_module = THIS_MODULE,
1061 .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
1062 .cra_init = rfc4106_init,
1063 .cra_exit = rfc4106_exit,
1064 .cra_u = {
1065 .aead = {
1066 .setkey = rfc4106_set_key,
1067 .setauthsize = rfc4106_set_authsize,
1068 .encrypt = rfc4106_encrypt,
1069 .decrypt = rfc4106_decrypt,
1070 .geniv = "seqiv",
1071 .ivsize = 8,
1072 .maxauthsize = 16,
1073 },
1074 },
1075};
1076
795static int __driver_rfc4106_encrypt(struct aead_request *req) 1077static int __driver_rfc4106_encrypt(struct aead_request *req)
796{ 1078{
797 u8 one_entry_in_sg = 0; 1079 u8 one_entry_in_sg = 0;
@@ -824,12 +1106,12 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
824 one_entry_in_sg = 1; 1106 one_entry_in_sg = 1;
825 scatterwalk_start(&src_sg_walk, req->src); 1107 scatterwalk_start(&src_sg_walk, req->src);
826 scatterwalk_start(&assoc_sg_walk, req->assoc); 1108 scatterwalk_start(&assoc_sg_walk, req->assoc);
827 src = scatterwalk_map(&src_sg_walk); 1109 src = scatterwalk_map(&src_sg_walk, 0);
828 assoc = scatterwalk_map(&assoc_sg_walk); 1110 assoc = scatterwalk_map(&assoc_sg_walk, 0);
829 dst = src; 1111 dst = src;
830 if (unlikely(req->src != req->dst)) { 1112 if (unlikely(req->src != req->dst)) {
831 scatterwalk_start(&dst_sg_walk, req->dst); 1113 scatterwalk_start(&dst_sg_walk, req->dst);
832 dst = scatterwalk_map(&dst_sg_walk); 1114 dst = scatterwalk_map(&dst_sg_walk, 0);
833 } 1115 }
834 1116
835 } else { 1117 } else {
@@ -853,11 +1135,11 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
853 * back to the packet. */ 1135 * back to the packet. */
854 if (one_entry_in_sg) { 1136 if (one_entry_in_sg) {
855 if (unlikely(req->src != req->dst)) { 1137 if (unlikely(req->src != req->dst)) {
856 scatterwalk_unmap(dst); 1138 scatterwalk_unmap(dst, 0);
857 scatterwalk_done(&dst_sg_walk, 0, 0); 1139 scatterwalk_done(&dst_sg_walk, 0, 0);
858 } 1140 }
859 scatterwalk_unmap(src); 1141 scatterwalk_unmap(src, 0);
860 scatterwalk_unmap(assoc); 1142 scatterwalk_unmap(assoc, 0);
861 scatterwalk_done(&src_sg_walk, 0, 0); 1143 scatterwalk_done(&src_sg_walk, 0, 0);
862 scatterwalk_done(&assoc_sg_walk, 0, 0); 1144 scatterwalk_done(&assoc_sg_walk, 0, 0);
863 } else { 1145 } else {
@@ -906,12 +1188,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
906 one_entry_in_sg = 1; 1188 one_entry_in_sg = 1;
907 scatterwalk_start(&src_sg_walk, req->src); 1189 scatterwalk_start(&src_sg_walk, req->src);
908 scatterwalk_start(&assoc_sg_walk, req->assoc); 1190 scatterwalk_start(&assoc_sg_walk, req->assoc);
909 src = scatterwalk_map(&src_sg_walk); 1191 src = scatterwalk_map(&src_sg_walk, 0);
910 assoc = scatterwalk_map(&assoc_sg_walk); 1192 assoc = scatterwalk_map(&assoc_sg_walk, 0);
911 dst = src; 1193 dst = src;
912 if (unlikely(req->src != req->dst)) { 1194 if (unlikely(req->src != req->dst)) {
913 scatterwalk_start(&dst_sg_walk, req->dst); 1195 scatterwalk_start(&dst_sg_walk, req->dst);
914 dst = scatterwalk_map(&dst_sg_walk); 1196 dst = scatterwalk_map(&dst_sg_walk, 0);
915 } 1197 }
916 1198
917 } else { 1199 } else {
@@ -936,11 +1218,11 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
936 1218
937 if (one_entry_in_sg) { 1219 if (one_entry_in_sg) {
938 if (unlikely(req->src != req->dst)) { 1220 if (unlikely(req->src != req->dst)) {
939 scatterwalk_unmap(dst); 1221 scatterwalk_unmap(dst, 0);
940 scatterwalk_done(&dst_sg_walk, 0, 0); 1222 scatterwalk_done(&dst_sg_walk, 0, 0);
941 } 1223 }
942 scatterwalk_unmap(src); 1224 scatterwalk_unmap(src, 0);
943 scatterwalk_unmap(assoc); 1225 scatterwalk_unmap(assoc, 0);
944 scatterwalk_done(&src_sg_walk, 0, 0); 1226 scatterwalk_done(&src_sg_walk, 0, 0);
945 scatterwalk_done(&assoc_sg_walk, 0, 0); 1227 scatterwalk_done(&assoc_sg_walk, 0, 0);
946 } else { 1228 } else {
@@ -949,378 +1231,145 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
949 } 1231 }
950 return retval; 1232 return retval;
951} 1233}
952#endif
953 1234
954static struct crypto_alg aesni_algs[] = { { 1235static struct crypto_alg __rfc4106_alg = {
955 .cra_name = "aes",
956 .cra_driver_name = "aes-aesni",
957 .cra_priority = 300,
958 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
959 .cra_blocksize = AES_BLOCK_SIZE,
960 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
961 AESNI_ALIGN - 1,
962 .cra_alignmask = 0,
963 .cra_module = THIS_MODULE,
964 .cra_u = {
965 .cipher = {
966 .cia_min_keysize = AES_MIN_KEY_SIZE,
967 .cia_max_keysize = AES_MAX_KEY_SIZE,
968 .cia_setkey = aes_set_key,
969 .cia_encrypt = aes_encrypt,
970 .cia_decrypt = aes_decrypt
971 }
972 }
973}, {
974 .cra_name = "__aes-aesni",
975 .cra_driver_name = "__driver-aes-aesni",
976 .cra_priority = 0,
977 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
978 .cra_blocksize = AES_BLOCK_SIZE,
979 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
980 AESNI_ALIGN - 1,
981 .cra_alignmask = 0,
982 .cra_module = THIS_MODULE,
983 .cra_u = {
984 .cipher = {
985 .cia_min_keysize = AES_MIN_KEY_SIZE,
986 .cia_max_keysize = AES_MAX_KEY_SIZE,
987 .cia_setkey = aes_set_key,
988 .cia_encrypt = __aes_encrypt,
989 .cia_decrypt = __aes_decrypt
990 }
991 }
992}, {
993 .cra_name = "__ecb-aes-aesni",
994 .cra_driver_name = "__driver-ecb-aes-aesni",
995 .cra_priority = 0,
996 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
997 .cra_blocksize = AES_BLOCK_SIZE,
998 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
999 AESNI_ALIGN - 1,
1000 .cra_alignmask = 0,
1001 .cra_type = &crypto_blkcipher_type,
1002 .cra_module = THIS_MODULE,
1003 .cra_u = {
1004 .blkcipher = {
1005 .min_keysize = AES_MIN_KEY_SIZE,
1006 .max_keysize = AES_MAX_KEY_SIZE,
1007 .setkey = aes_set_key,
1008 .encrypt = ecb_encrypt,
1009 .decrypt = ecb_decrypt,
1010 },
1011 },
1012}, {
1013 .cra_name = "__cbc-aes-aesni",
1014 .cra_driver_name = "__driver-cbc-aes-aesni",
1015 .cra_priority = 0,
1016 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1017 .cra_blocksize = AES_BLOCK_SIZE,
1018 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1019 AESNI_ALIGN - 1,
1020 .cra_alignmask = 0,
1021 .cra_type = &crypto_blkcipher_type,
1022 .cra_module = THIS_MODULE,
1023 .cra_u = {
1024 .blkcipher = {
1025 .min_keysize = AES_MIN_KEY_SIZE,
1026 .max_keysize = AES_MAX_KEY_SIZE,
1027 .setkey = aes_set_key,
1028 .encrypt = cbc_encrypt,
1029 .decrypt = cbc_decrypt,
1030 },
1031 },
1032}, {
1033 .cra_name = "ecb(aes)",
1034 .cra_driver_name = "ecb-aes-aesni",
1035 .cra_priority = 400,
1036 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1037 .cra_blocksize = AES_BLOCK_SIZE,
1038 .cra_ctxsize = sizeof(struct async_helper_ctx),
1039 .cra_alignmask = 0,
1040 .cra_type = &crypto_ablkcipher_type,
1041 .cra_module = THIS_MODULE,
1042 .cra_init = ablk_ecb_init,
1043 .cra_exit = ablk_exit,
1044 .cra_u = {
1045 .ablkcipher = {
1046 .min_keysize = AES_MIN_KEY_SIZE,
1047 .max_keysize = AES_MAX_KEY_SIZE,
1048 .setkey = ablk_set_key,
1049 .encrypt = ablk_encrypt,
1050 .decrypt = ablk_decrypt,
1051 },
1052 },
1053}, {
1054 .cra_name = "cbc(aes)",
1055 .cra_driver_name = "cbc-aes-aesni",
1056 .cra_priority = 400,
1057 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1058 .cra_blocksize = AES_BLOCK_SIZE,
1059 .cra_ctxsize = sizeof(struct async_helper_ctx),
1060 .cra_alignmask = 0,
1061 .cra_type = &crypto_ablkcipher_type,
1062 .cra_module = THIS_MODULE,
1063 .cra_init = ablk_cbc_init,
1064 .cra_exit = ablk_exit,
1065 .cra_u = {
1066 .ablkcipher = {
1067 .min_keysize = AES_MIN_KEY_SIZE,
1068 .max_keysize = AES_MAX_KEY_SIZE,
1069 .ivsize = AES_BLOCK_SIZE,
1070 .setkey = ablk_set_key,
1071 .encrypt = ablk_encrypt,
1072 .decrypt = ablk_decrypt,
1073 },
1074 },
1075#ifdef CONFIG_X86_64
1076}, {
1077 .cra_name = "__ctr-aes-aesni",
1078 .cra_driver_name = "__driver-ctr-aes-aesni",
1079 .cra_priority = 0,
1080 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1081 .cra_blocksize = 1,
1082 .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
1083 AESNI_ALIGN - 1,
1084 .cra_alignmask = 0,
1085 .cra_type = &crypto_blkcipher_type,
1086 .cra_module = THIS_MODULE,
1087 .cra_u = {
1088 .blkcipher = {
1089 .min_keysize = AES_MIN_KEY_SIZE,
1090 .max_keysize = AES_MAX_KEY_SIZE,
1091 .ivsize = AES_BLOCK_SIZE,
1092 .setkey = aes_set_key,
1093 .encrypt = ctr_crypt,
1094 .decrypt = ctr_crypt,
1095 },
1096 },
1097}, {
1098 .cra_name = "ctr(aes)",
1099 .cra_driver_name = "ctr-aes-aesni",
1100 .cra_priority = 400,
1101 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1102 .cra_blocksize = 1,
1103 .cra_ctxsize = sizeof(struct async_helper_ctx),
1104 .cra_alignmask = 0,
1105 .cra_type = &crypto_ablkcipher_type,
1106 .cra_module = THIS_MODULE,
1107 .cra_init = ablk_ctr_init,
1108 .cra_exit = ablk_exit,
1109 .cra_u = {
1110 .ablkcipher = {
1111 .min_keysize = AES_MIN_KEY_SIZE,
1112 .max_keysize = AES_MAX_KEY_SIZE,
1113 .ivsize = AES_BLOCK_SIZE,
1114 .setkey = ablk_set_key,
1115 .encrypt = ablk_encrypt,
1116 .decrypt = ablk_encrypt,
1117 .geniv = "chainiv",
1118 },
1119 },
1120}, {
1121 .cra_name = "__gcm-aes-aesni", 1236 .cra_name = "__gcm-aes-aesni",
1122 .cra_driver_name = "__driver-gcm-aes-aesni", 1237 .cra_driver_name = "__driver-gcm-aes-aesni",
1123 .cra_priority = 0, 1238 .cra_priority = 0,
1124 .cra_flags = CRYPTO_ALG_TYPE_AEAD, 1239 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
1125 .cra_blocksize = 1, 1240 .cra_blocksize = 1,
1126 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + 1241 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1127 AESNI_ALIGN,
1128 .cra_alignmask = 0, 1242 .cra_alignmask = 0,
1129 .cra_type = &crypto_aead_type, 1243 .cra_type = &crypto_aead_type,
1130 .cra_module = THIS_MODULE, 1244 .cra_module = THIS_MODULE,
1245 .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
1131 .cra_u = { 1246 .cra_u = {
1132 .aead = { 1247 .aead = {
1133 .encrypt = __driver_rfc4106_encrypt, 1248 .encrypt = __driver_rfc4106_encrypt,
1134 .decrypt = __driver_rfc4106_decrypt, 1249 .decrypt = __driver_rfc4106_decrypt,
1135 }, 1250 },
1136 }, 1251 },
1137}, {
1138 .cra_name = "rfc4106(gcm(aes))",
1139 .cra_driver_name = "rfc4106-gcm-aesni",
1140 .cra_priority = 400,
1141 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1142 .cra_blocksize = 1,
1143 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) +
1144 AESNI_ALIGN,
1145 .cra_alignmask = 0,
1146 .cra_type = &crypto_nivaead_type,
1147 .cra_module = THIS_MODULE,
1148 .cra_init = rfc4106_init,
1149 .cra_exit = rfc4106_exit,
1150 .cra_u = {
1151 .aead = {
1152 .setkey = rfc4106_set_key,
1153 .setauthsize = rfc4106_set_authsize,
1154 .encrypt = rfc4106_encrypt,
1155 .decrypt = rfc4106_decrypt,
1156 .geniv = "seqiv",
1157 .ivsize = 8,
1158 .maxauthsize = 16,
1159 },
1160 },
1161#ifdef HAS_CTR
1162}, {
1163 .cra_name = "rfc3686(ctr(aes))",
1164 .cra_driver_name = "rfc3686-ctr-aes-aesni",
1165 .cra_priority = 400,
1166 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1167 .cra_blocksize = 1,
1168 .cra_ctxsize = sizeof(struct async_helper_ctx),
1169 .cra_alignmask = 0,
1170 .cra_type = &crypto_ablkcipher_type,
1171 .cra_module = THIS_MODULE,
1172 .cra_init = ablk_rfc3686_ctr_init,
1173 .cra_exit = ablk_exit,
1174 .cra_u = {
1175 .ablkcipher = {
1176 .min_keysize = AES_MIN_KEY_SIZE +
1177 CTR_RFC3686_NONCE_SIZE,
1178 .max_keysize = AES_MAX_KEY_SIZE +
1179 CTR_RFC3686_NONCE_SIZE,
1180 .ivsize = CTR_RFC3686_IV_SIZE,
1181 .setkey = ablk_set_key,
1182 .encrypt = ablk_encrypt,
1183 .decrypt = ablk_decrypt,
1184 .geniv = "seqiv",
1185 },
1186 },
1187#endif
1188#endif
1189#ifdef HAS_PCBC
1190}, {
1191 .cra_name = "pcbc(aes)",
1192 .cra_driver_name = "pcbc-aes-aesni",
1193 .cra_priority = 400,
1194 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1195 .cra_blocksize = AES_BLOCK_SIZE,
1196 .cra_ctxsize = sizeof(struct async_helper_ctx),
1197 .cra_alignmask = 0,
1198 .cra_type = &crypto_ablkcipher_type,
1199 .cra_module = THIS_MODULE,
1200 .cra_init = ablk_pcbc_init,
1201 .cra_exit = ablk_exit,
1202 .cra_u = {
1203 .ablkcipher = {
1204 .min_keysize = AES_MIN_KEY_SIZE,
1205 .max_keysize = AES_MAX_KEY_SIZE,
1206 .ivsize = AES_BLOCK_SIZE,
1207 .setkey = ablk_set_key,
1208 .encrypt = ablk_encrypt,
1209 .decrypt = ablk_decrypt,
1210 },
1211 },
1212#endif
1213}, {
1214 .cra_name = "__lrw-aes-aesni",
1215 .cra_driver_name = "__driver-lrw-aes-aesni",
1216 .cra_priority = 0,
1217 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1218 .cra_blocksize = AES_BLOCK_SIZE,
1219 .cra_ctxsize = sizeof(struct aesni_lrw_ctx),
1220 .cra_alignmask = 0,
1221 .cra_type = &crypto_blkcipher_type,
1222 .cra_module = THIS_MODULE,
1223 .cra_exit = lrw_aesni_exit_tfm,
1224 .cra_u = {
1225 .blkcipher = {
1226 .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
1227 .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
1228 .ivsize = AES_BLOCK_SIZE,
1229 .setkey = lrw_aesni_setkey,
1230 .encrypt = lrw_encrypt,
1231 .decrypt = lrw_decrypt,
1232 },
1233 },
1234}, {
1235 .cra_name = "__xts-aes-aesni",
1236 .cra_driver_name = "__driver-xts-aes-aesni",
1237 .cra_priority = 0,
1238 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1239 .cra_blocksize = AES_BLOCK_SIZE,
1240 .cra_ctxsize = sizeof(struct aesni_xts_ctx),
1241 .cra_alignmask = 0,
1242 .cra_type = &crypto_blkcipher_type,
1243 .cra_module = THIS_MODULE,
1244 .cra_u = {
1245 .blkcipher = {
1246 .min_keysize = 2 * AES_MIN_KEY_SIZE,
1247 .max_keysize = 2 * AES_MAX_KEY_SIZE,
1248 .ivsize = AES_BLOCK_SIZE,
1249 .setkey = xts_aesni_setkey,
1250 .encrypt = xts_encrypt,
1251 .decrypt = xts_decrypt,
1252 },
1253 },
1254}, {
1255 .cra_name = "lrw(aes)",
1256 .cra_driver_name = "lrw-aes-aesni",
1257 .cra_priority = 400,
1258 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1259 .cra_blocksize = AES_BLOCK_SIZE,
1260 .cra_ctxsize = sizeof(struct async_helper_ctx),
1261 .cra_alignmask = 0,
1262 .cra_type = &crypto_ablkcipher_type,
1263 .cra_module = THIS_MODULE,
1264 .cra_init = ablk_init,
1265 .cra_exit = ablk_exit,
1266 .cra_u = {
1267 .ablkcipher = {
1268 .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
1269 .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
1270 .ivsize = AES_BLOCK_SIZE,
1271 .setkey = ablk_set_key,
1272 .encrypt = ablk_encrypt,
1273 .decrypt = ablk_decrypt,
1274 },
1275 },
1276}, {
1277 .cra_name = "xts(aes)",
1278 .cra_driver_name = "xts-aes-aesni",
1279 .cra_priority = 400,
1280 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
1281 .cra_blocksize = AES_BLOCK_SIZE,
1282 .cra_ctxsize = sizeof(struct async_helper_ctx),
1283 .cra_alignmask = 0,
1284 .cra_type = &crypto_ablkcipher_type,
1285 .cra_module = THIS_MODULE,
1286 .cra_init = ablk_init,
1287 .cra_exit = ablk_exit,
1288 .cra_u = {
1289 .ablkcipher = {
1290 .min_keysize = 2 * AES_MIN_KEY_SIZE,
1291 .max_keysize = 2 * AES_MAX_KEY_SIZE,
1292 .ivsize = AES_BLOCK_SIZE,
1293 .setkey = ablk_set_key,
1294 .encrypt = ablk_encrypt,
1295 .decrypt = ablk_decrypt,
1296 },
1297 },
1298} };
1299
1300
1301static const struct x86_cpu_id aesni_cpu_id[] = {
1302 X86_FEATURE_MATCH(X86_FEATURE_AES),
1303 {}
1304}; 1252};
1305MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); 1253#endif
1306 1254
1307static int __init aesni_init(void) 1255static int __init aesni_init(void)
1308{ 1256{
1309 int err; 1257 int err;
1310 1258
1311 if (!x86_match_cpu(aesni_cpu_id)) 1259 if (!cpu_has_aes) {
1260 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
1312 return -ENODEV; 1261 return -ENODEV;
1262 }
1313 1263
1314 err = crypto_fpu_init(); 1264 if ((err = crypto_fpu_init()))
1315 if (err) 1265 goto fpu_err;
1316 return err; 1266 if ((err = crypto_register_alg(&aesni_alg)))
1267 goto aes_err;
1268 if ((err = crypto_register_alg(&__aesni_alg)))
1269 goto __aes_err;
1270 if ((err = crypto_register_alg(&blk_ecb_alg)))
1271 goto blk_ecb_err;
1272 if ((err = crypto_register_alg(&blk_cbc_alg)))
1273 goto blk_cbc_err;
1274 if ((err = crypto_register_alg(&ablk_ecb_alg)))
1275 goto ablk_ecb_err;
1276 if ((err = crypto_register_alg(&ablk_cbc_alg)))
1277 goto ablk_cbc_err;
1278#ifdef CONFIG_X86_64
1279 if ((err = crypto_register_alg(&blk_ctr_alg)))
1280 goto blk_ctr_err;
1281 if ((err = crypto_register_alg(&ablk_ctr_alg)))
1282 goto ablk_ctr_err;
1283 if ((err = crypto_register_alg(&__rfc4106_alg)))
1284 goto __aead_gcm_err;
1285 if ((err = crypto_register_alg(&rfc4106_alg)))
1286 goto aead_gcm_err;
1287#ifdef HAS_CTR
1288 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
1289 goto ablk_rfc3686_ctr_err;
1290#endif
1291#endif
1292#ifdef HAS_LRW
1293 if ((err = crypto_register_alg(&ablk_lrw_alg)))
1294 goto ablk_lrw_err;
1295#endif
1296#ifdef HAS_PCBC
1297 if ((err = crypto_register_alg(&ablk_pcbc_alg)))
1298 goto ablk_pcbc_err;
1299#endif
1300#ifdef HAS_XTS
1301 if ((err = crypto_register_alg(&ablk_xts_alg)))
1302 goto ablk_xts_err;
1303#endif
1304 return err;
1317 1305
1318 return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); 1306#ifdef HAS_XTS
1307ablk_xts_err:
1308#endif
1309#ifdef HAS_PCBC
1310 crypto_unregister_alg(&ablk_pcbc_alg);
1311ablk_pcbc_err:
1312#endif
1313#ifdef HAS_LRW
1314 crypto_unregister_alg(&ablk_lrw_alg);
1315ablk_lrw_err:
1316#endif
1317#ifdef CONFIG_X86_64
1318#ifdef HAS_CTR
1319 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
1320ablk_rfc3686_ctr_err:
1321#endif
1322 crypto_unregister_alg(&rfc4106_alg);
1323aead_gcm_err:
1324 crypto_unregister_alg(&__rfc4106_alg);
1325__aead_gcm_err:
1326 crypto_unregister_alg(&ablk_ctr_alg);
1327ablk_ctr_err:
1328 crypto_unregister_alg(&blk_ctr_alg);
1329blk_ctr_err:
1330#endif
1331 crypto_unregister_alg(&ablk_cbc_alg);
1332ablk_cbc_err:
1333 crypto_unregister_alg(&ablk_ecb_alg);
1334ablk_ecb_err:
1335 crypto_unregister_alg(&blk_cbc_alg);
1336blk_cbc_err:
1337 crypto_unregister_alg(&blk_ecb_alg);
1338blk_ecb_err:
1339 crypto_unregister_alg(&__aesni_alg);
1340__aes_err:
1341 crypto_unregister_alg(&aesni_alg);
1342aes_err:
1343fpu_err:
1344 return err;
1319} 1345}
1320 1346
1321static void __exit aesni_exit(void) 1347static void __exit aesni_exit(void)
1322{ 1348{
1323 crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); 1349#ifdef HAS_XTS
1350 crypto_unregister_alg(&ablk_xts_alg);
1351#endif
1352#ifdef HAS_PCBC
1353 crypto_unregister_alg(&ablk_pcbc_alg);
1354#endif
1355#ifdef HAS_LRW
1356 crypto_unregister_alg(&ablk_lrw_alg);
1357#endif
1358#ifdef CONFIG_X86_64
1359#ifdef HAS_CTR
1360 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
1361#endif
1362 crypto_unregister_alg(&rfc4106_alg);
1363 crypto_unregister_alg(&__rfc4106_alg);
1364 crypto_unregister_alg(&ablk_ctr_alg);
1365 crypto_unregister_alg(&blk_ctr_alg);
1366#endif
1367 crypto_unregister_alg(&ablk_cbc_alg);
1368 crypto_unregister_alg(&ablk_ecb_alg);
1369 crypto_unregister_alg(&blk_cbc_alg);
1370 crypto_unregister_alg(&blk_ecb_alg);
1371 crypto_unregister_alg(&__aesni_alg);
1372 crypto_unregister_alg(&aesni_alg);
1324 1373
1325 crypto_fpu_exit(); 1374 crypto_fpu_exit();
1326} 1375}
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
deleted file mode 100644
index 391d245dc08..00000000000
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ /dev/null
@@ -1,390 +0,0 @@
1/*
2 * Blowfish Cipher Algorithm (x86_64)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19 * USA
20 *
21 */
22
23.file "blowfish-x86_64-asm.S"
24.text
25
26/* structure of crypto context */
27#define p 0
28#define s0 ((16 + 2) * 4)
29#define s1 ((16 + 2 + (1 * 256)) * 4)
30#define s2 ((16 + 2 + (2 * 256)) * 4)
31#define s3 ((16 + 2 + (3 * 256)) * 4)
32
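The offsets above follow the generic Blowfish context layout (an 18-entry P-array followed by four 256-entry S-boxes, all u32). A minimal C sketch of the layout the asm assumes, with the byte offsets it hard-codes, could look like this (the struct name is illustrative; the real definition is struct bf_ctx in <crypto/blowfish.h>):

#include <linux/types.h>

/* Hypothetical sketch of the context layout the offsets above encode. */
struct bf_ctx_sketch {
	u32 p[16 + 2];		/* byte offset 0,              "p"  */
	u32 s0[256];		/* byte offset (16 + 2) * 4,   "s0" */
	u32 s1[256];		/* byte offset s0 + 256 * 4,   "s1" */
	u32 s2[256];		/* byte offset s1 + 256 * 4,   "s2" */
	u32 s3[256];		/* byte offset s2 + 256 * 4,   "s3" */
};
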
33/* register macros */
34#define CTX %rdi
35#define RIO %rsi
36
37#define RX0 %rax
38#define RX1 %rbx
39#define RX2 %rcx
40#define RX3 %rdx
41
42#define RX0d %eax
43#define RX1d %ebx
44#define RX2d %ecx
45#define RX3d %edx
46
47#define RX0bl %al
48#define RX1bl %bl
49#define RX2bl %cl
50#define RX3bl %dl
51
52#define RX0bh %ah
53#define RX1bh %bh
54#define RX2bh %ch
55#define RX3bh %dh
56
57#define RT0 %rbp
58#define RT1 %rsi
59#define RT2 %r8
60#define RT3 %r9
61
62#define RT0d %ebp
63#define RT1d %esi
64#define RT2d %r8d
65#define RT3d %r9d
66
67#define RKEY %r10
68
69/***********************************************************************
70 * 1-way blowfish
71 ***********************************************************************/
72#define F() \
73 rorq $16, RX0; \
74 movzbl RX0bh, RT0d; \
75 movzbl RX0bl, RT1d; \
76 rolq $16, RX0; \
77 movl s0(CTX,RT0,4), RT0d; \
78 addl s1(CTX,RT1,4), RT0d; \
79 movzbl RX0bh, RT1d; \
80 movzbl RX0bl, RT2d; \
81 rolq $32, RX0; \
82 xorl s2(CTX,RT1,4), RT0d; \
83 addl s3(CTX,RT2,4), RT0d; \
84 xorq RT0, RX0;
85
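In scalar terms, F() above is the standard Blowfish round function applied to one 32-bit half of the block. A hedged C equivalent (table pointers and the function name are illustrative, not taken from the glue code):

#include <linux/types.h>

/* Scalar sketch of F(): ((s0[a] + s1[b]) ^ s2[c]) + s3[d], where a..d are
 * the bytes of x from most to least significant, as in the asm above. */
static u32 blowfish_f(const u32 *s0, const u32 *s1, const u32 *s2,
		      const u32 *s3, u32 x)
{
	return ((s0[x >> 24] + s1[(x >> 16) & 0xff]) ^
		s2[(x >> 8) & 0xff]) + s3[x & 0xff];
}
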
86#define add_roundkey_enc(n) \
87 xorq p+4*(n)(CTX), RX0;
88
89#define round_enc(n) \
90 add_roundkey_enc(n); \
91 \
92 F(); \
93 F();
94
95#define add_roundkey_dec(n) \
96 movq p+4*(n-1)(CTX), RT0; \
97 rorq $32, RT0; \
98 xorq RT0, RX0;
99
100#define round_dec(n) \
101 add_roundkey_dec(n); \
102 \
103 F(); \
104	F();
105
106#define read_block() \
107 movq (RIO), RX0; \
108 rorq $32, RX0; \
109 bswapq RX0;
110
111#define write_block() \
112 bswapq RX0; \
113 movq RX0, (RIO);
114
115#define xor_block() \
116 bswapq RX0; \
117 xorq RX0, (RIO);
118
119.align 8
120.global __blowfish_enc_blk
121.type __blowfish_enc_blk,@function;
122
123__blowfish_enc_blk:
124 /* input:
125 * %rdi: ctx, CTX
126 * %rsi: dst
127 * %rdx: src
128 * %rcx: bool, if true: xor output
129 */
130 movq %rbp, %r11;
131
132 movq %rsi, %r10;
133 movq %rdx, RIO;
134
135 read_block();
136
137 round_enc(0);
138 round_enc(2);
139 round_enc(4);
140 round_enc(6);
141 round_enc(8);
142 round_enc(10);
143 round_enc(12);
144 round_enc(14);
145 add_roundkey_enc(16);
146
147 movq %r11, %rbp;
148
149 movq %r10, RIO;
150 test %cl, %cl;
151 jnz __enc_xor;
152
153 write_block();
154 ret;
155__enc_xor:
156 xor_block();
157 ret;
158
159.align 8
160.global blowfish_dec_blk
161.type blowfish_dec_blk,@function;
162
163blowfish_dec_blk:
164 /* input:
165 * %rdi: ctx, CTX
166 * %rsi: dst
167 * %rdx: src
168 */
169 movq %rbp, %r11;
170
171 movq %rsi, %r10;
172 movq %rdx, RIO;
173
174 read_block();
175
176 round_dec(17);
177 round_dec(15);
178 round_dec(13);
179 round_dec(11);
180 round_dec(9);
181 round_dec(7);
182 round_dec(5);
183 round_dec(3);
184 add_roundkey_dec(1);
185
186 movq %r10, RIO;
187 write_block();
188
189 movq %r11, %rbp;
190
191 ret;
192
193/**********************************************************************
194 4-way blowfish, four blocks parallel
195 **********************************************************************/
196
197/* F() for 4-way. Slower when used alone/1-way, but faster when used
198 * in parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
199 */
200#define F4(x) \
201 movzbl x ## bh, RT1d; \
202 movzbl x ## bl, RT3d; \
203 rorq $16, x; \
204 movzbl x ## bh, RT0d; \
205 movzbl x ## bl, RT2d; \
206 rorq $16, x; \
207 movl s0(CTX,RT0,4), RT0d; \
208 addl s1(CTX,RT2,4), RT0d; \
209 xorl s2(CTX,RT1,4), RT0d; \
210 addl s3(CTX,RT3,4), RT0d; \
211 xorq RT0, x;
212
213#define add_preloaded_roundkey4() \
214 xorq RKEY, RX0; \
215 xorq RKEY, RX1; \
216 xorq RKEY, RX2; \
217 xorq RKEY, RX3;
218
219#define preload_roundkey_enc(n) \
220 movq p+4*(n)(CTX), RKEY;
221
222#define add_roundkey_enc4(n) \
223 add_preloaded_roundkey4(); \
224 preload_roundkey_enc(n + 2);
225
226#define round_enc4(n) \
227 add_roundkey_enc4(n); \
228 \
229 F4(RX0); \
230 F4(RX1); \
231 F4(RX2); \
232 F4(RX3); \
233 \
234 F4(RX0); \
235 F4(RX1); \
236 F4(RX2); \
237 F4(RX3);
238
239#define preload_roundkey_dec(n) \
240 movq p+4*((n)-1)(CTX), RKEY; \
241 rorq $32, RKEY;
242
243#define add_roundkey_dec4(n) \
244 add_preloaded_roundkey4(); \
245 preload_roundkey_dec(n - 2);
246
247#define round_dec4(n) \
248 add_roundkey_dec4(n); \
249 \
250 F4(RX0); \
251 F4(RX1); \
252 F4(RX2); \
253 F4(RX3); \
254 \
255 F4(RX0); \
256 F4(RX1); \
257 F4(RX2); \
258 F4(RX3);
259
260#define read_block4() \
261 movq (RIO), RX0; \
262 rorq $32, RX0; \
263 bswapq RX0; \
264 \
265 movq 8(RIO), RX1; \
266 rorq $32, RX1; \
267 bswapq RX1; \
268 \
269 movq 16(RIO), RX2; \
270 rorq $32, RX2; \
271 bswapq RX2; \
272 \
273 movq 24(RIO), RX3; \
274 rorq $32, RX3; \
275 bswapq RX3;
276
277#define write_block4() \
278 bswapq RX0; \
279 movq RX0, (RIO); \
280 \
281 bswapq RX1; \
282 movq RX1, 8(RIO); \
283 \
284 bswapq RX2; \
285 movq RX2, 16(RIO); \
286 \
287 bswapq RX3; \
288 movq RX3, 24(RIO);
289
290#define xor_block4() \
291 bswapq RX0; \
292 xorq RX0, (RIO); \
293 \
294 bswapq RX1; \
295 xorq RX1, 8(RIO); \
296 \
297 bswapq RX2; \
298 xorq RX2, 16(RIO); \
299 \
300 bswapq RX3; \
301 xorq RX3, 24(RIO);
302
303.align 8
304.global __blowfish_enc_blk_4way
305.type __blowfish_enc_blk_4way,@function;
306
307__blowfish_enc_blk_4way:
308 /* input:
309 * %rdi: ctx, CTX
310 * %rsi: dst
311 * %rdx: src
312 * %rcx: bool, if true: xor output
313 */
314 pushq %rbp;
315 pushq %rbx;
316 pushq %rcx;
317
318 preload_roundkey_enc(0);
319
320 movq %rsi, %r11;
321 movq %rdx, RIO;
322
323 read_block4();
324
325 round_enc4(0);
326 round_enc4(2);
327 round_enc4(4);
328 round_enc4(6);
329 round_enc4(8);
330 round_enc4(10);
331 round_enc4(12);
332 round_enc4(14);
333 add_preloaded_roundkey4();
334
335 popq %rbp;
336 movq %r11, RIO;
337
338 test %bpl, %bpl;
339 jnz __enc_xor4;
340
341 write_block4();
342
343 popq %rbx;
344 popq %rbp;
345 ret;
346
347__enc_xor4:
348 xor_block4();
349
350 popq %rbx;
351 popq %rbp;
352 ret;
353
354.align 8
355.global blowfish_dec_blk_4way
356.type blowfish_dec_blk_4way,@function;
357
358blowfish_dec_blk_4way:
359 /* input:
360 * %rdi: ctx, CTX
361 * %rsi: dst
362 * %rdx: src
363 */
364 pushq %rbp;
365 pushq %rbx;
366 preload_roundkey_dec(17);
367
368 movq %rsi, %r11;
369 movq %rdx, RIO;
370
371 read_block4();
372
373 round_dec4(17);
374 round_dec4(15);
375 round_dec4(13);
376 round_dec4(11);
377 round_dec4(9);
378 round_dec4(7);
379 round_dec4(5);
380 round_dec4(3);
381 add_preloaded_roundkey4();
382
383 movq %r11, RIO;
384 write_block4();
385
386 popq %rbx;
387 popq %rbp;
388
389 ret;
390
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
deleted file mode 100644
index 50ec333b70e..00000000000
--- a/arch/x86/crypto/blowfish_glue.c
+++ /dev/null
@@ -1,485 +0,0 @@
1/*
2 * Glue Code for assembler optimized version of Blowfish
3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
8 * CTR part based on code (crypto/ctr.c) by:
9 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
24 * USA
25 *
26 */
27
28#include <asm/processor.h>
29#include <crypto/blowfish.h>
30#include <linux/crypto.h>
31#include <linux/init.h>
32#include <linux/module.h>
33#include <linux/types.h>
34#include <crypto/algapi.h>
35
36/* regular block cipher functions */
37asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
38 bool xor);
39asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
40
41/* 4-way parallel cipher functions */
42asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
43 const u8 *src, bool xor);
44asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
45 const u8 *src);
46
47static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
48{
49 __blowfish_enc_blk(ctx, dst, src, false);
50}
51
52static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
53 const u8 *src)
54{
55 __blowfish_enc_blk(ctx, dst, src, true);
56}
57
58static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
59 const u8 *src)
60{
61 __blowfish_enc_blk_4way(ctx, dst, src, false);
62}
63
64static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
65 const u8 *src)
66{
67 __blowfish_enc_blk_4way(ctx, dst, src, true);
68}
69
70static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
71{
72 blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
73}
74
75static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
76{
77 blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
78}
79
80static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
81 void (*fn)(struct bf_ctx *, u8 *, const u8 *),
82 void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
83{
84 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
85 unsigned int bsize = BF_BLOCK_SIZE;
86 unsigned int nbytes;
87 int err;
88
89 err = blkcipher_walk_virt(desc, walk);
90
91 while ((nbytes = walk->nbytes)) {
92 u8 *wsrc = walk->src.virt.addr;
93 u8 *wdst = walk->dst.virt.addr;
94
95 /* Process four block batch */
96 if (nbytes >= bsize * 4) {
97 do {
98 fn_4way(ctx, wdst, wsrc);
99
100 wsrc += bsize * 4;
101 wdst += bsize * 4;
102 nbytes -= bsize * 4;
103 } while (nbytes >= bsize * 4);
104
105 if (nbytes < bsize)
106 goto done;
107 }
108
109 /* Handle leftovers */
110 do {
111 fn(ctx, wdst, wsrc);
112
113 wsrc += bsize;
114 wdst += bsize;
115 nbytes -= bsize;
116 } while (nbytes >= bsize);
117
118done:
119 err = blkcipher_walk_done(desc, walk, nbytes);
120 }
121
122 return err;
123}
124
125static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
126 struct scatterlist *src, unsigned int nbytes)
127{
128 struct blkcipher_walk walk;
129
130 blkcipher_walk_init(&walk, dst, src, nbytes);
131 return ecb_crypt(desc, &walk, blowfish_enc_blk, blowfish_enc_blk_4way);
132}
133
134static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
135 struct scatterlist *src, unsigned int nbytes)
136{
137 struct blkcipher_walk walk;
138
139 blkcipher_walk_init(&walk, dst, src, nbytes);
140 return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way);
141}
142
143static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
144 struct blkcipher_walk *walk)
145{
146 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
147 unsigned int bsize = BF_BLOCK_SIZE;
148 unsigned int nbytes = walk->nbytes;
149 u64 *src = (u64 *)walk->src.virt.addr;
150 u64 *dst = (u64 *)walk->dst.virt.addr;
151 u64 *iv = (u64 *)walk->iv;
152
153 do {
154 *dst = *src ^ *iv;
155 blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
156 iv = dst;
157
158 src += 1;
159 dst += 1;
160 nbytes -= bsize;
161 } while (nbytes >= bsize);
162
163 *(u64 *)walk->iv = *iv;
164 return nbytes;
165}
166
167static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
168 struct scatterlist *src, unsigned int nbytes)
169{
170 struct blkcipher_walk walk;
171 int err;
172
173 blkcipher_walk_init(&walk, dst, src, nbytes);
174 err = blkcipher_walk_virt(desc, &walk);
175
176 while ((nbytes = walk.nbytes)) {
177 nbytes = __cbc_encrypt(desc, &walk);
178 err = blkcipher_walk_done(desc, &walk, nbytes);
179 }
180
181 return err;
182}
183
184static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
185 struct blkcipher_walk *walk)
186{
187 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
188 unsigned int bsize = BF_BLOCK_SIZE;
189 unsigned int nbytes = walk->nbytes;
190 u64 *src = (u64 *)walk->src.virt.addr;
191 u64 *dst = (u64 *)walk->dst.virt.addr;
192 u64 ivs[4 - 1];
193 u64 last_iv;
194
195 /* Start of the last block. */
196 src += nbytes / bsize - 1;
197 dst += nbytes / bsize - 1;
198
199 last_iv = *src;
200
201 /* Process four block batch */
202 if (nbytes >= bsize * 4) {
203 do {
204 nbytes -= bsize * 4 - bsize;
205 src -= 4 - 1;
206 dst -= 4 - 1;
207
208 ivs[0] = src[0];
209 ivs[1] = src[1];
210 ivs[2] = src[2];
211
212 blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
213
214 dst[1] ^= ivs[0];
215 dst[2] ^= ivs[1];
216 dst[3] ^= ivs[2];
217
218 nbytes -= bsize;
219 if (nbytes < bsize)
220 goto done;
221
222 *dst ^= *(src - 1);
223 src -= 1;
224 dst -= 1;
225 } while (nbytes >= bsize * 4);
226
227 if (nbytes < bsize)
228 goto done;
229 }
230
231 /* Handle leftovers */
232 for (;;) {
233 blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
234
235 nbytes -= bsize;
236 if (nbytes < bsize)
237 break;
238
239 *dst ^= *(src - 1);
240 src -= 1;
241 dst -= 1;
242 }
243
244done:
245 *dst ^= *(u64 *)walk->iv;
246 *(u64 *)walk->iv = last_iv;
247
248 return nbytes;
249}
250
251static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
252 struct scatterlist *src, unsigned int nbytes)
253{
254 struct blkcipher_walk walk;
255 int err;
256
257 blkcipher_walk_init(&walk, dst, src, nbytes);
258 err = blkcipher_walk_virt(desc, &walk);
259
260 while ((nbytes = walk.nbytes)) {
261 nbytes = __cbc_decrypt(desc, &walk);
262 err = blkcipher_walk_done(desc, &walk, nbytes);
263 }
264
265 return err;
266}
267
268static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk)
269{
270 u8 *ctrblk = walk->iv;
271 u8 keystream[BF_BLOCK_SIZE];
272 u8 *src = walk->src.virt.addr;
273 u8 *dst = walk->dst.virt.addr;
274 unsigned int nbytes = walk->nbytes;
275
276 blowfish_enc_blk(ctx, keystream, ctrblk);
277 crypto_xor(keystream, src, nbytes);
278 memcpy(dst, keystream, nbytes);
279
280 crypto_inc(ctrblk, BF_BLOCK_SIZE);
281}
282
283static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
284 struct blkcipher_walk *walk)
285{
286 struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
287 unsigned int bsize = BF_BLOCK_SIZE;
288 unsigned int nbytes = walk->nbytes;
289 u64 *src = (u64 *)walk->src.virt.addr;
290 u64 *dst = (u64 *)walk->dst.virt.addr;
291 u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
292 __be64 ctrblocks[4];
293
294 /* Process four block batch */
295 if (nbytes >= bsize * 4) {
296 do {
297 if (dst != src) {
298 dst[0] = src[0];
299 dst[1] = src[1];
300 dst[2] = src[2];
301 dst[3] = src[3];
302 }
303
304 /* create ctrblks for parallel encrypt */
305 ctrblocks[0] = cpu_to_be64(ctrblk++);
306 ctrblocks[1] = cpu_to_be64(ctrblk++);
307 ctrblocks[2] = cpu_to_be64(ctrblk++);
308 ctrblocks[3] = cpu_to_be64(ctrblk++);
309
310 blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
311 (u8 *)ctrblocks);
312
313 src += 4;
314 dst += 4;
315 } while ((nbytes -= bsize * 4) >= bsize * 4);
316
317 if (nbytes < bsize)
318 goto done;
319 }
320
321 /* Handle leftovers */
322 do {
323 if (dst != src)
324 *dst = *src;
325
326 ctrblocks[0] = cpu_to_be64(ctrblk++);
327
328 blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
329
330 src += 1;
331 dst += 1;
332 } while ((nbytes -= bsize) >= bsize);
333
334done:
335 *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
336 return nbytes;
337}
338
339static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
340 struct scatterlist *src, unsigned int nbytes)
341{
342 struct blkcipher_walk walk;
343 int err;
344
345 blkcipher_walk_init(&walk, dst, src, nbytes);
346 err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
347
348 while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
349 nbytes = __ctr_crypt(desc, &walk);
350 err = blkcipher_walk_done(desc, &walk, nbytes);
351 }
352
353 if (walk.nbytes) {
354 ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk);
355 err = blkcipher_walk_done(desc, &walk, 0);
356 }
357
358 return err;
359}
360
361static struct crypto_alg bf_algs[4] = { {
362 .cra_name = "blowfish",
363 .cra_driver_name = "blowfish-asm",
364 .cra_priority = 200,
365 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
366 .cra_blocksize = BF_BLOCK_SIZE,
367 .cra_ctxsize = sizeof(struct bf_ctx),
368 .cra_alignmask = 0,
369 .cra_module = THIS_MODULE,
370 .cra_u = {
371 .cipher = {
372 .cia_min_keysize = BF_MIN_KEY_SIZE,
373 .cia_max_keysize = BF_MAX_KEY_SIZE,
374 .cia_setkey = blowfish_setkey,
375 .cia_encrypt = blowfish_encrypt,
376 .cia_decrypt = blowfish_decrypt,
377 }
378 }
379}, {
380 .cra_name = "ecb(blowfish)",
381 .cra_driver_name = "ecb-blowfish-asm",
382 .cra_priority = 300,
383 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
384 .cra_blocksize = BF_BLOCK_SIZE,
385 .cra_ctxsize = sizeof(struct bf_ctx),
386 .cra_alignmask = 0,
387 .cra_type = &crypto_blkcipher_type,
388 .cra_module = THIS_MODULE,
389 .cra_u = {
390 .blkcipher = {
391 .min_keysize = BF_MIN_KEY_SIZE,
392 .max_keysize = BF_MAX_KEY_SIZE,
393 .setkey = blowfish_setkey,
394 .encrypt = ecb_encrypt,
395 .decrypt = ecb_decrypt,
396 },
397 },
398}, {
399 .cra_name = "cbc(blowfish)",
400 .cra_driver_name = "cbc-blowfish-asm",
401 .cra_priority = 300,
402 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
403 .cra_blocksize = BF_BLOCK_SIZE,
404 .cra_ctxsize = sizeof(struct bf_ctx),
405 .cra_alignmask = 0,
406 .cra_type = &crypto_blkcipher_type,
407 .cra_module = THIS_MODULE,
408 .cra_u = {
409 .blkcipher = {
410 .min_keysize = BF_MIN_KEY_SIZE,
411 .max_keysize = BF_MAX_KEY_SIZE,
412 .ivsize = BF_BLOCK_SIZE,
413 .setkey = blowfish_setkey,
414 .encrypt = cbc_encrypt,
415 .decrypt = cbc_decrypt,
416 },
417 },
418}, {
419 .cra_name = "ctr(blowfish)",
420 .cra_driver_name = "ctr-blowfish-asm",
421 .cra_priority = 300,
422 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
423 .cra_blocksize = 1,
424 .cra_ctxsize = sizeof(struct bf_ctx),
425 .cra_alignmask = 0,
426 .cra_type = &crypto_blkcipher_type,
427 .cra_module = THIS_MODULE,
428 .cra_u = {
429 .blkcipher = {
430 .min_keysize = BF_MIN_KEY_SIZE,
431 .max_keysize = BF_MAX_KEY_SIZE,
432 .ivsize = BF_BLOCK_SIZE,
433 .setkey = blowfish_setkey,
434 .encrypt = ctr_crypt,
435 .decrypt = ctr_crypt,
436 },
437 },
438} };
439
440static bool is_blacklisted_cpu(void)
441{
442 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
443 return false;
444
445 if (boot_cpu_data.x86 == 0x0f) {
446 /*
447		 * On Pentium 4, blowfish-x86_64 is slower than the generic C
448		 * implementation because it uses 64-bit rotates (which are really
449		 * slow on P4). Therefore, blacklist P4s.
450 */
451 return true;
452 }
453
454 return false;
455}
456
457static int force;
458module_param(force, int, 0);
459MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
460
461static int __init init(void)
462{
463 if (!force && is_blacklisted_cpu()) {
464 printk(KERN_INFO
465 "blowfish-x86_64: performance on this CPU "
466 "would be suboptimal: disabling "
467 "blowfish-x86_64.\n");
468 return -ENODEV;
469 }
470
471 return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
472}
473
474static void __exit fini(void)
475{
476 crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
477}
478
479module_init(init);
480module_exit(fini);
481
482MODULE_LICENSE("GPL");
483MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
484MODULE_ALIAS("blowfish");
485MODULE_ALIAS("blowfish-asm");
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
deleted file mode 100644
index 2306d2e4816..00000000000
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ /dev/null
@@ -1,1102 +0,0 @@
1/*
2 * x86_64/AVX/AES-NI assembler implementation of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13/*
14 * A version licensed under the 2-clause BSD License is available at:
15 * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
16 */
17
18#define CAMELLIA_TABLE_BYTE_LEN 272
19
20/* struct camellia_ctx: */
21#define key_table 0
22#define key_length CAMELLIA_TABLE_BYTE_LEN
23
24/* register macros */
25#define CTX %rdi
26
27/**********************************************************************
28 16-way camellia
29 **********************************************************************/
30#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
31 vpand x, mask4bit, tmp0; \
32 vpandn x, mask4bit, x; \
33 vpsrld $4, x, x; \
34 \
35 vpshufb tmp0, lo_t, tmp0; \
36 vpshufb x, hi_t, x; \
37 vpxor tmp0, x, x;
38
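filter_8bit() is the usual vpshufb idiom for an 8-bit table lookup split into two 16-entry nibble tables. Per byte it computes roughly the following (a sketch; the table names are placeholders for the .Lpre_tf_*/.Lpost_tf_* pairs):

#include <linux/types.h>

/* Sketch of the per-byte effect of filter_8bit(): look up the low and high
 * nibbles in two 16-entry tables and xor the results. */
static u8 filter_8bit_scalar(const u8 lo_t[16], const u8 hi_t[16], u8 x)
{
	return lo_t[x & 0x0f] ^ hi_t[x >> 4];
}
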
39/*
40 * IN:
41 * x0..x7: byte-sliced AB state
42 * mem_cd: register pointer storing CD state
43 * key: index for key material
44 * OUT:
45 * x0..x7: new byte-sliced CD state
46 */
47#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
48 t7, mem_cd, key) \
49 /* \
50 * S-function with AES subbytes \
51 */ \
52 vmovdqa .Linv_shift_row, t4; \
53 vbroadcastss .L0f0f0f0f, t7; \
54 vmovdqa .Lpre_tf_lo_s1, t0; \
55 vmovdqa .Lpre_tf_hi_s1, t1; \
56 \
57 /* AES inverse shift rows */ \
58 vpshufb t4, x0, x0; \
59 vpshufb t4, x7, x7; \
60 vpshufb t4, x1, x1; \
61 vpshufb t4, x4, x4; \
62 vpshufb t4, x2, x2; \
63 vpshufb t4, x5, x5; \
64 vpshufb t4, x3, x3; \
65 vpshufb t4, x6, x6; \
66 \
67 /* prefilter sboxes 1, 2 and 3 */ \
68 vmovdqa .Lpre_tf_lo_s4, t2; \
69 vmovdqa .Lpre_tf_hi_s4, t3; \
70 filter_8bit(x0, t0, t1, t7, t6); \
71 filter_8bit(x7, t0, t1, t7, t6); \
72 filter_8bit(x1, t0, t1, t7, t6); \
73 filter_8bit(x4, t0, t1, t7, t6); \
74 filter_8bit(x2, t0, t1, t7, t6); \
75 filter_8bit(x5, t0, t1, t7, t6); \
76 \
77 /* prefilter sbox 4 */ \
78 vpxor t4, t4, t4; \
79 filter_8bit(x3, t2, t3, t7, t6); \
80 filter_8bit(x6, t2, t3, t7, t6); \
81 \
82 /* AES subbytes + AES shift rows */ \
83 vmovdqa .Lpost_tf_lo_s1, t0; \
84 vmovdqa .Lpost_tf_hi_s1, t1; \
85 vaesenclast t4, x0, x0; \
86 vaesenclast t4, x7, x7; \
87 vaesenclast t4, x1, x1; \
88 vaesenclast t4, x4, x4; \
89 vaesenclast t4, x2, x2; \
90 vaesenclast t4, x5, x5; \
91 vaesenclast t4, x3, x3; \
92 vaesenclast t4, x6, x6; \
93 \
94 /* postfilter sboxes 1 and 4 */ \
95 vmovdqa .Lpost_tf_lo_s3, t2; \
96 vmovdqa .Lpost_tf_hi_s3, t3; \
97 filter_8bit(x0, t0, t1, t7, t6); \
98 filter_8bit(x7, t0, t1, t7, t6); \
99 filter_8bit(x3, t0, t1, t7, t6); \
100 filter_8bit(x6, t0, t1, t7, t6); \
101 \
102 /* postfilter sbox 3 */ \
103 vmovdqa .Lpost_tf_lo_s2, t4; \
104 vmovdqa .Lpost_tf_hi_s2, t5; \
105 filter_8bit(x2, t2, t3, t7, t6); \
106 filter_8bit(x5, t2, t3, t7, t6); \
107 \
108 vpxor t6, t6, t6; \
109 vmovq key, t0; \
110 \
111 /* postfilter sbox 2 */ \
112 filter_8bit(x1, t4, t5, t7, t2); \
113 filter_8bit(x4, t4, t5, t7, t2); \
114 \
115 vpsrldq $5, t0, t5; \
116 vpsrldq $1, t0, t1; \
117 vpsrldq $2, t0, t2; \
118 vpsrldq $3, t0, t3; \
119 vpsrldq $4, t0, t4; \
120 vpshufb t6, t0, t0; \
121 vpshufb t6, t1, t1; \
122 vpshufb t6, t2, t2; \
123 vpshufb t6, t3, t3; \
124 vpshufb t6, t4, t4; \
125 vpsrldq $2, t5, t7; \
126 vpshufb t6, t7, t7; \
127 \
128 /* \
129 * P-function \
130 */ \
131 vpxor x5, x0, x0; \
132 vpxor x6, x1, x1; \
133 vpxor x7, x2, x2; \
134 vpxor x4, x3, x3; \
135 \
136 vpxor x2, x4, x4; \
137 vpxor x3, x5, x5; \
138 vpxor x0, x6, x6; \
139 vpxor x1, x7, x7; \
140 \
141 vpxor x7, x0, x0; \
142 vpxor x4, x1, x1; \
143 vpxor x5, x2, x2; \
144 vpxor x6, x3, x3; \
145 \
146 vpxor x3, x4, x4; \
147 vpxor x0, x5, x5; \
148 vpxor x1, x6, x6; \
149 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
150 \
151 /* \
152 * Add key material and result to CD (x becomes new CD) \
153 */ \
154 \
155 vpxor t3, x4, x4; \
156 vpxor 0 * 16(mem_cd), x4, x4; \
157 \
158 vpxor t2, x5, x5; \
159 vpxor 1 * 16(mem_cd), x5, x5; \
160 \
161 vpsrldq $1, t5, t3; \
162 vpshufb t6, t5, t5; \
163 vpshufb t6, t3, t6; \
164 \
165 vpxor t1, x6, x6; \
166 vpxor 2 * 16(mem_cd), x6, x6; \
167 \
168 vpxor t0, x7, x7; \
169 vpxor 3 * 16(mem_cd), x7, x7; \
170 \
171 vpxor t7, x0, x0; \
172 vpxor 4 * 16(mem_cd), x0, x0; \
173 \
174 vpxor t6, x1, x1; \
175 vpxor 5 * 16(mem_cd), x1, x1; \
176 \
177 vpxor t5, x2, x2; \
178 vpxor 6 * 16(mem_cd), x2, x2; \
179 \
180 vpxor t4, x3, x3; \
181 vpxor 7 * 16(mem_cd), x3, x3;
182
183/*
184 * Size optimization... with inlined roundsm16, binary would be over 5 times
185 * larger and would only be 0.5% faster (on sandy-bridge).
186 */
187.align 8
188roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
189 roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
190 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
191 %rcx, (%r9));
192 ret;
193
194.align 8
195roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
196 roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
197 %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
198 %rax, (%r9));
199 ret;
200
201/*
202 * IN/OUT:
203 * x0..x7: byte-sliced AB state preloaded
204 * mem_ab: byte-sliced AB state in memory
205 * mem_cd: byte-sliced CD state in memory
206 */
207#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
208 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
209 leaq (key_table + (i) * 8)(CTX), %r9; \
210 call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
211 \
212 vmovdqu x4, 0 * 16(mem_cd); \
213 vmovdqu x5, 1 * 16(mem_cd); \
214 vmovdqu x6, 2 * 16(mem_cd); \
215 vmovdqu x7, 3 * 16(mem_cd); \
216 vmovdqu x0, 4 * 16(mem_cd); \
217 vmovdqu x1, 5 * 16(mem_cd); \
218 vmovdqu x2, 6 * 16(mem_cd); \
219 vmovdqu x3, 7 * 16(mem_cd); \
220 \
221 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
222 call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
223 \
224 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
225
226#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
227
228#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
229 /* Store new AB state */ \
230 vmovdqu x0, 0 * 16(mem_ab); \
231 vmovdqu x1, 1 * 16(mem_ab); \
232 vmovdqu x2, 2 * 16(mem_ab); \
233 vmovdqu x3, 3 * 16(mem_ab); \
234 vmovdqu x4, 4 * 16(mem_ab); \
235 vmovdqu x5, 5 * 16(mem_ab); \
236 vmovdqu x6, 6 * 16(mem_ab); \
237 vmovdqu x7, 7 * 16(mem_ab);
238
239#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
240 y6, y7, mem_ab, mem_cd, i) \
241 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
242 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
243 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
244 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
245 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
246 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
247
248#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
249 y6, y7, mem_ab, mem_cd, i) \
250 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
251 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
252 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
253 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
254 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
255 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
256
257/*
258 * IN:
259 * v0..3: byte-sliced 32-bit integers
260 * OUT:
261 * v0..3: (IN <<< 1)
262 */
263#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
264 vpcmpgtb v0, zero, t0; \
265 vpaddb v0, v0, v0; \
266 vpabsb t0, t0; \
267 \
268 vpcmpgtb v1, zero, t1; \
269 vpaddb v1, v1, v1; \
270 vpabsb t1, t1; \
271 \
272 vpcmpgtb v2, zero, t2; \
273 vpaddb v2, v2, v2; \
274 vpabsb t2, t2; \
275 \
276 vpor t0, v1, v1; \
277 \
278 vpcmpgtb v3, zero, t0; \
279 vpaddb v3, v3, v3; \
280 vpabsb t0, t0; \
281 \
282 vpor t1, v2, v2; \
283 vpor t2, v3, v3; \
284 vpor t0, v0, v0;
285
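Because the state is byte-sliced, the 32-bit rotate-left-by-1 is performed per slice: each slice shifts left by one and its carried-out MSBs are or'ed into the next slice, with the carry from the last slice wrapping around to the first. A scalar per-lane sketch of the same carry chain (names are illustrative):

#include <linux/types.h>

/* One byte lane of rol32_1_16(): shift each slice left by 1 and propagate
 * the MSB carries v0 -> v1 -> v2 -> v3 -> v0, matching the asm above. */
static void rol32_1_slice(u8 *v0, u8 *v1, u8 *v2, u8 *v3)
{
	u8 c0 = *v0 >> 7, c1 = *v1 >> 7, c2 = *v2 >> 7, c3 = *v3 >> 7;

	*v0 = (*v0 << 1) | c3;
	*v1 = (*v1 << 1) | c0;
	*v2 = (*v2 << 1) | c1;
	*v3 = (*v3 << 1) | c2;
}
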
286/*
287 * IN:
288 * r: byte-sliced AB state in memory
289 * l: byte-sliced CD state in memory
290 * OUT:
291 * x0..x7: new byte-sliced CD state
292 */
293#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
294 tt1, tt2, tt3, kll, klr, krl, krr) \
295 /* \
296 * t0 = kll; \
297 * t0 &= ll; \
298 * lr ^= rol32(t0, 1); \
299 */ \
300 vpxor tt0, tt0, tt0; \
301 vmovd kll, t0; \
302 vpshufb tt0, t0, t3; \
303 vpsrldq $1, t0, t0; \
304 vpshufb tt0, t0, t2; \
305 vpsrldq $1, t0, t0; \
306 vpshufb tt0, t0, t1; \
307 vpsrldq $1, t0, t0; \
308 vpshufb tt0, t0, t0; \
309 \
310 vpand l0, t0, t0; \
311 vpand l1, t1, t1; \
312 vpand l2, t2, t2; \
313 vpand l3, t3, t3; \
314 \
315 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
316 \
317 vpxor l4, t0, l4; \
318 vmovdqu l4, 4 * 16(l); \
319 vpxor l5, t1, l5; \
320 vmovdqu l5, 5 * 16(l); \
321 vpxor l6, t2, l6; \
322 vmovdqu l6, 6 * 16(l); \
323 vpxor l7, t3, l7; \
324 vmovdqu l7, 7 * 16(l); \
325 \
326 /* \
327 * t2 = krr; \
328 * t2 |= rr; \
329 * rl ^= t2; \
330 */ \
331 \
332 vmovd krr, t0; \
333 vpshufb tt0, t0, t3; \
334 vpsrldq $1, t0, t0; \
335 vpshufb tt0, t0, t2; \
336 vpsrldq $1, t0, t0; \
337 vpshufb tt0, t0, t1; \
338 vpsrldq $1, t0, t0; \
339 vpshufb tt0, t0, t0; \
340 \
341 vpor 4 * 16(r), t0, t0; \
342 vpor 5 * 16(r), t1, t1; \
343 vpor 6 * 16(r), t2, t2; \
344 vpor 7 * 16(r), t3, t3; \
345 \
346 vpxor 0 * 16(r), t0, t0; \
347 vpxor 1 * 16(r), t1, t1; \
348 vpxor 2 * 16(r), t2, t2; \
349 vpxor 3 * 16(r), t3, t3; \
350 vmovdqu t0, 0 * 16(r); \
351 vmovdqu t1, 1 * 16(r); \
352 vmovdqu t2, 2 * 16(r); \
353 vmovdqu t3, 3 * 16(r); \
354 \
355 /* \
356 * t2 = krl; \
357 * t2 &= rl; \
358 * rr ^= rol32(t2, 1); \
359 */ \
360 vmovd krl, t0; \
361 vpshufb tt0, t0, t3; \
362 vpsrldq $1, t0, t0; \
363 vpshufb tt0, t0, t2; \
364 vpsrldq $1, t0, t0; \
365 vpshufb tt0, t0, t1; \
366 vpsrldq $1, t0, t0; \
367 vpshufb tt0, t0, t0; \
368 \
369 vpand 0 * 16(r), t0, t0; \
370 vpand 1 * 16(r), t1, t1; \
371 vpand 2 * 16(r), t2, t2; \
372 vpand 3 * 16(r), t3, t3; \
373 \
374 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
375 \
376 vpxor 4 * 16(r), t0, t0; \
377 vpxor 5 * 16(r), t1, t1; \
378 vpxor 6 * 16(r), t2, t2; \
379 vpxor 7 * 16(r), t3, t3; \
380 vmovdqu t0, 4 * 16(r); \
381 vmovdqu t1, 5 * 16(r); \
382 vmovdqu t2, 6 * 16(r); \
383 vmovdqu t3, 7 * 16(r); \
384 \
385 /* \
386 * t0 = klr; \
387 * t0 |= lr; \
388 * ll ^= t0; \
389 */ \
390 \
391 vmovd klr, t0; \
392 vpshufb tt0, t0, t3; \
393 vpsrldq $1, t0, t0; \
394 vpshufb tt0, t0, t2; \
395 vpsrldq $1, t0, t0; \
396 vpshufb tt0, t0, t1; \
397 vpsrldq $1, t0, t0; \
398 vpshufb tt0, t0, t0; \
399 \
400 vpor l4, t0, t0; \
401 vpor l5, t1, t1; \
402 vpor l6, t2, t2; \
403 vpor l7, t3, t3; \
404 \
405 vpxor l0, t0, l0; \
406 vmovdqu l0, 0 * 16(l); \
407 vpxor l1, t1, l1; \
408 vmovdqu l1, 1 * 16(l); \
409 vpxor l2, t2, l2; \
410 vmovdqu l2, 2 * 16(l); \
411 vpxor l3, t3, l3; \
412 vmovdqu l3, 3 * 16(l);
413
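Written out in scalar form, the pseudocode in the comments above is the Camellia FL/FL^-1 layer: FL on the (ll, lr) half and FL^-1 on the (rl, rr) half, applied in the same subkey order as fls16(). A hedged C sketch, assuming rol32() from <linux/bitops.h> (the function name is illustrative):

#include <linux/bitops.h>	/* rol32() */
#include <linux/types.h>

/* Scalar sketch of one fls16() step with subkeys kll, klr, krl, krr. */
static void camellia_fl_pair(u32 *ll, u32 *lr, u32 *rl, u32 *rr,
			     u32 kll, u32 klr, u32 krl, u32 krr)
{
	*lr ^= rol32(*ll & kll, 1);	/* FL:     lr ^= rol32(ll & kll, 1) */
	*rl ^= *rr | krr;		/* FL^-1:  rl ^= (rr | krr)         */
	*rr ^= rol32(*rl & krl, 1);	/* FL^-1:  rr ^= rol32(rl & krl, 1) */
	*ll ^= *lr | klr;		/* FL:     ll ^= (lr | klr)         */
}
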
414#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
415 vpunpckhdq x1, x0, t2; \
416 vpunpckldq x1, x0, x0; \
417 \
418 vpunpckldq x3, x2, t1; \
419 vpunpckhdq x3, x2, x2; \
420 \
421 vpunpckhqdq t1, x0, x1; \
422 vpunpcklqdq t1, x0, x0; \
423 \
424 vpunpckhqdq x2, t2, x3; \
425 vpunpcklqdq x2, t2, x2;
426
427#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
428 b3, c3, d3, st0, st1) \
429 vmovdqu d2, st0; \
430 vmovdqu d3, st1; \
431 transpose_4x4(a0, a1, a2, a3, d2, d3); \
432 transpose_4x4(b0, b1, b2, b3, d2, d3); \
433 vmovdqu st0, d2; \
434 vmovdqu st1, d3; \
435 \
436 vmovdqu a0, st0; \
437 vmovdqu a1, st1; \
438 transpose_4x4(c0, c1, c2, c3, a0, a1); \
439 transpose_4x4(d0, d1, d2, d3, a0, a1); \
440 \
441 vmovdqu .Lshufb_16x16b, a0; \
442 vmovdqu st1, a1; \
443 vpshufb a0, a2, a2; \
444 vpshufb a0, a3, a3; \
445 vpshufb a0, b0, b0; \
446 vpshufb a0, b1, b1; \
447 vpshufb a0, b2, b2; \
448 vpshufb a0, b3, b3; \
449 vpshufb a0, a1, a1; \
450 vpshufb a0, c0, c0; \
451 vpshufb a0, c1, c1; \
452 vpshufb a0, c2, c2; \
453 vpshufb a0, c3, c3; \
454 vpshufb a0, d0, d0; \
455 vpshufb a0, d1, d1; \
456 vpshufb a0, d2, d2; \
457 vpshufb a0, d3, d3; \
458 vmovdqu d3, st1; \
459 vmovdqu st0, d3; \
460 vpshufb a0, d3, a0; \
461 vmovdqu d2, st0; \
462 \
463 transpose_4x4(a0, b0, c0, d0, d2, d3); \
464 transpose_4x4(a1, b1, c1, d1, d2, d3); \
465 vmovdqu st0, d2; \
466 vmovdqu st1, d3; \
467 \
468 vmovdqu b0, st0; \
469 vmovdqu b1, st1; \
470 transpose_4x4(a2, b2, c2, d2, b0, b1); \
471 transpose_4x4(a3, b3, c3, d3, b0, b1); \
472 vmovdqu st0, b0; \
473 vmovdqu st1, b1; \
474 /* does not adjust output bytes inside vectors */
475
476/* load blocks to registers and apply pre-whitening */
477#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
478 y6, y7, rio, key) \
479 vmovq key, x0; \
480 vpshufb .Lpack_bswap, x0, x0; \
481 \
482 vpxor 0 * 16(rio), x0, y7; \
483 vpxor 1 * 16(rio), x0, y6; \
484 vpxor 2 * 16(rio), x0, y5; \
485 vpxor 3 * 16(rio), x0, y4; \
486 vpxor 4 * 16(rio), x0, y3; \
487 vpxor 5 * 16(rio), x0, y2; \
488 vpxor 6 * 16(rio), x0, y1; \
489 vpxor 7 * 16(rio), x0, y0; \
490 vpxor 8 * 16(rio), x0, x7; \
491 vpxor 9 * 16(rio), x0, x6; \
492 vpxor 10 * 16(rio), x0, x5; \
493 vpxor 11 * 16(rio), x0, x4; \
494 vpxor 12 * 16(rio), x0, x3; \
495 vpxor 13 * 16(rio), x0, x2; \
496 vpxor 14 * 16(rio), x0, x1; \
497 vpxor 15 * 16(rio), x0, x0;
498
499/* byteslice pre-whitened blocks and store to temporary memory */
500#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
501 y6, y7, mem_ab, mem_cd) \
502 byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
503 y5, y6, y7, (mem_ab), (mem_cd)); \
504 \
505 vmovdqu x0, 0 * 16(mem_ab); \
506 vmovdqu x1, 1 * 16(mem_ab); \
507 vmovdqu x2, 2 * 16(mem_ab); \
508 vmovdqu x3, 3 * 16(mem_ab); \
509 vmovdqu x4, 4 * 16(mem_ab); \
510 vmovdqu x5, 5 * 16(mem_ab); \
511 vmovdqu x6, 6 * 16(mem_ab); \
512 vmovdqu x7, 7 * 16(mem_ab); \
513 vmovdqu y0, 0 * 16(mem_cd); \
514 vmovdqu y1, 1 * 16(mem_cd); \
515 vmovdqu y2, 2 * 16(mem_cd); \
516 vmovdqu y3, 3 * 16(mem_cd); \
517 vmovdqu y4, 4 * 16(mem_cd); \
518 vmovdqu y5, 5 * 16(mem_cd); \
519 vmovdqu y6, 6 * 16(mem_cd); \
520 vmovdqu y7, 7 * 16(mem_cd);
521
522/* de-byteslice, apply post-whitening and store blocks */
523#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
524 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
525 byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
526 y7, x3, x7, stack_tmp0, stack_tmp1); \
527 \
528 vmovdqu x0, stack_tmp0; \
529 \
530 vmovq key, x0; \
531 vpshufb .Lpack_bswap, x0, x0; \
532 \
533 vpxor x0, y7, y7; \
534 vpxor x0, y6, y6; \
535 vpxor x0, y5, y5; \
536 vpxor x0, y4, y4; \
537 vpxor x0, y3, y3; \
538 vpxor x0, y2, y2; \
539 vpxor x0, y1, y1; \
540 vpxor x0, y0, y0; \
541 vpxor x0, x7, x7; \
542 vpxor x0, x6, x6; \
543 vpxor x0, x5, x5; \
544 vpxor x0, x4, x4; \
545 vpxor x0, x3, x3; \
546 vpxor x0, x2, x2; \
547 vpxor x0, x1, x1; \
548 vpxor stack_tmp0, x0, x0;
549
550#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
551 y6, y7, rio) \
552 vmovdqu x0, 0 * 16(rio); \
553 vmovdqu x1, 1 * 16(rio); \
554 vmovdqu x2, 2 * 16(rio); \
555 vmovdqu x3, 3 * 16(rio); \
556 vmovdqu x4, 4 * 16(rio); \
557 vmovdqu x5, 5 * 16(rio); \
558 vmovdqu x6, 6 * 16(rio); \
559 vmovdqu x7, 7 * 16(rio); \
560 vmovdqu y0, 8 * 16(rio); \
561 vmovdqu y1, 9 * 16(rio); \
562 vmovdqu y2, 10 * 16(rio); \
563 vmovdqu y3, 11 * 16(rio); \
564 vmovdqu y4, 12 * 16(rio); \
565 vmovdqu y5, 13 * 16(rio); \
566 vmovdqu y6, 14 * 16(rio); \
567 vmovdqu y7, 15 * 16(rio);
568
569.data
570.align 16
571
572#define SHUFB_BYTES(idx) \
573 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
574
575.Lshufb_16x16b:
576 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
577
578.Lpack_bswap:
579 .long 0x00010203
580 .long 0x04050607
581 .long 0x80808080
582 .long 0x80808080
583
584/* For CTR-mode IV byteswap */
585.Lbswap128_mask:
586 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
587
588/*
589 * pre-SubByte transform
590 *
591 * pre-lookup for sbox1, sbox2, sbox3:
592 * swap_bitendianness(
593 * isom_map_camellia_to_aes(
594 * camellia_f(
595 *       swap_bitendianness(in)
596 * )
597 * )
598 * )
599 *
600 * (note: '⊕ 0xc5' inside camellia_f())
601 */
602.Lpre_tf_lo_s1:
603 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
604 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
605.Lpre_tf_hi_s1:
606 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
607 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
608
609/*
610 * pre-SubByte transform
611 *
612 * pre-lookup for sbox4:
613 * swap_bitendianness(
614 * isom_map_camellia_to_aes(
615 * camellia_f(
616 *       swap_bitendianness(in <<< 1)
617 * )
618 * )
619 * )
620 *
621 * (note: '⊕ 0xc5' inside camellia_f())
622 */
623.Lpre_tf_lo_s4:
624 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
625 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
626.Lpre_tf_hi_s4:
627 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
628 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
629
630/*
631 * post-SubByte transform
632 *
633 * post-lookup for sbox1, sbox4:
634 * swap_bitendianness(
635 * camellia_h(
636 * isom_map_aes_to_camellia(
637 * swap_bitendianness(
638 * aes_inverse_affine_transform(in)
639 * )
640 * )
641 * )
642 * )
643 *
644 * (note: '⊕ 0x6e' inside camellia_h())
645 */
646.Lpost_tf_lo_s1:
647 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
648 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
649.Lpost_tf_hi_s1:
650 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
651 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
652
653/*
654 * post-SubByte transform
655 *
656 * post-lookup for sbox2:
657 * swap_bitendianness(
658 * camellia_h(
659 * isom_map_aes_to_camellia(
660 * swap_bitendianness(
661 * aes_inverse_affine_transform(in)
662 * )
663 * )
664 * )
665 * ) <<< 1
666 *
667 * (note: '⊕ 0x6e' inside camellia_h())
668 */
669.Lpost_tf_lo_s2:
670 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
671 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
672.Lpost_tf_hi_s2:
673 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
674 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
675
676/*
677 * post-SubByte transform
678 *
679 * post-lookup for sbox3:
680 * swap_bitendianness(
681 * camellia_h(
682 * isom_map_aes_to_camellia(
683 * swap_bitendianness(
684 * aes_inverse_affine_transform(in)
685 * )
686 * )
687 * )
688 * ) >>> 1
689 *
690 * (note: '⊕ 0x6e' inside camellia_h())
691 */
692.Lpost_tf_lo_s3:
693 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
694 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
695.Lpost_tf_hi_s3:
696 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
697 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
698
699/* For isolating SubBytes from AESENCLAST, inverse shift row */
700.Linv_shift_row:
701 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
702 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
703
704/* 4-bit mask */
705.align 4
706.L0f0f0f0f:
707 .long 0x0f0f0f0f
708
709.text
710
711.align 8
712.type __camellia_enc_blk16,@function;
713
714__camellia_enc_blk16:
715 /* input:
716 * %rdi: ctx, CTX
717 * %rax: temporary storage, 256 bytes
718 * %xmm0..%xmm15: 16 plaintext blocks
719 * output:
720 * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
721 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
722 */
723
724 leaq 8 * 16(%rax), %rcx;
725
726 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
727 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
728 %xmm15, %rax, %rcx);
729
730 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
731 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
732 %xmm15, %rax, %rcx, 0);
733
734 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
735 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
736 %xmm15,
737 ((key_table + (8) * 8) + 0)(CTX),
738 ((key_table + (8) * 8) + 4)(CTX),
739 ((key_table + (8) * 8) + 8)(CTX),
740 ((key_table + (8) * 8) + 12)(CTX));
741
742 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
743 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
744 %xmm15, %rax, %rcx, 8);
745
746 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
747 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
748 %xmm15,
749 ((key_table + (16) * 8) + 0)(CTX),
750 ((key_table + (16) * 8) + 4)(CTX),
751 ((key_table + (16) * 8) + 8)(CTX),
752 ((key_table + (16) * 8) + 12)(CTX));
753
754 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
755 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
756 %xmm15, %rax, %rcx, 16);
757
758 movl $24, %r8d;
759 cmpl $16, key_length(CTX);
760 jne .Lenc_max32;
761
762.Lenc_done:
763 /* load CD for output */
764 vmovdqu 0 * 16(%rcx), %xmm8;
765 vmovdqu 1 * 16(%rcx), %xmm9;
766 vmovdqu 2 * 16(%rcx), %xmm10;
767 vmovdqu 3 * 16(%rcx), %xmm11;
768 vmovdqu 4 * 16(%rcx), %xmm12;
769 vmovdqu 5 * 16(%rcx), %xmm13;
770 vmovdqu 6 * 16(%rcx), %xmm14;
771 vmovdqu 7 * 16(%rcx), %xmm15;
772
773 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
774 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
775 %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
776
777 ret;
778
779.align 8
780.Lenc_max32:
781 movl $32, %r8d;
782
783 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
784 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
785 %xmm15,
786 ((key_table + (24) * 8) + 0)(CTX),
787 ((key_table + (24) * 8) + 4)(CTX),
788 ((key_table + (24) * 8) + 8)(CTX),
789 ((key_table + (24) * 8) + 12)(CTX));
790
791 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
792 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
793 %xmm15, %rax, %rcx, 24);
794
795 jmp .Lenc_done;
796
797.align 8
798.type __camellia_dec_blk16,@function;
799
800__camellia_dec_blk16:
801 /* input:
802 * %rdi: ctx, CTX
803 * %rax: temporary storage, 256 bytes
804 * %r8d: 24 for 16 byte key, 32 for larger
805 * %xmm0..%xmm15: 16 encrypted blocks
806 * output:
807 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
808 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
809 */
810
811 leaq 8 * 16(%rax), %rcx;
812
813 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
814 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
815 %xmm15, %rax, %rcx);
816
817 cmpl $32, %r8d;
818 je .Ldec_max32;
819
820.Ldec_max24:
821 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
822 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
823 %xmm15, %rax, %rcx, 16);
824
825 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
826 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
827 %xmm15,
828 ((key_table + (16) * 8) + 8)(CTX),
829 ((key_table + (16) * 8) + 12)(CTX),
830 ((key_table + (16) * 8) + 0)(CTX),
831 ((key_table + (16) * 8) + 4)(CTX));
832
833 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
834 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
835 %xmm15, %rax, %rcx, 8);
836
837 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
838 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
839 %xmm15,
840 ((key_table + (8) * 8) + 8)(CTX),
841 ((key_table + (8) * 8) + 12)(CTX),
842 ((key_table + (8) * 8) + 0)(CTX),
843 ((key_table + (8) * 8) + 4)(CTX));
844
845 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
846 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
847 %xmm15, %rax, %rcx, 0);
848
849 /* load CD for output */
850 vmovdqu 0 * 16(%rcx), %xmm8;
851 vmovdqu 1 * 16(%rcx), %xmm9;
852 vmovdqu 2 * 16(%rcx), %xmm10;
853 vmovdqu 3 * 16(%rcx), %xmm11;
854 vmovdqu 4 * 16(%rcx), %xmm12;
855 vmovdqu 5 * 16(%rcx), %xmm13;
856 vmovdqu 6 * 16(%rcx), %xmm14;
857 vmovdqu 7 * 16(%rcx), %xmm15;
858
859 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
860 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
861 %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
862
863 ret;
864
865.align 8
866.Ldec_max32:
867 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
868 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
869 %xmm15, %rax, %rcx, 24);
870
871 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
872 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
873 %xmm15,
874 ((key_table + (24) * 8) + 8)(CTX),
875 ((key_table + (24) * 8) + 12)(CTX),
876 ((key_table + (24) * 8) + 0)(CTX),
877 ((key_table + (24) * 8) + 4)(CTX));
878
879 jmp .Ldec_max24;
880
881.align 8
882.global camellia_ecb_enc_16way
883.type camellia_ecb_enc_16way,@function;
884
885camellia_ecb_enc_16way:
886 /* input:
887 * %rdi: ctx, CTX
888 * %rsi: dst (16 blocks)
889 * %rdx: src (16 blocks)
890 */
891
892 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
893 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
894 %xmm15, %rdx, (key_table)(CTX));
895
896 /* now dst can be used as temporary buffer (even in src == dst case) */
897 movq %rsi, %rax;
898
899 call __camellia_enc_blk16;
900
901 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
902 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
903 %xmm8, %rsi);
904
905 ret;
906
907.align 8
908.global camellia_ecb_dec_16way
909.type camellia_ecb_dec_16way,@function;
910
911camellia_ecb_dec_16way:
912 /* input:
913 * %rdi: ctx, CTX
914 * %rsi: dst (16 blocks)
915 * %rdx: src (16 blocks)
916 */
917
918 cmpl $16, key_length(CTX);
919 movl $32, %r8d;
920 movl $24, %eax;
921 cmovel %eax, %r8d; /* max */
922
923 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
924 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
925 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
926
927 /* now dst can be used as temporary buffer (even in src == dst case) */
928 movq %rsi, %rax;
929
930 call __camellia_dec_blk16;
931
932 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
933 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
934 %xmm8, %rsi);
935
936 ret;
937
938.align 8
939.global camellia_cbc_dec_16way
940.type camellia_cbc_dec_16way,@function;
941
942camellia_cbc_dec_16way:
943 /* input:
944 * %rdi: ctx, CTX
945 * %rsi: dst (16 blocks)
946 * %rdx: src (16 blocks)
947 */
948
949 cmpl $16, key_length(CTX);
950 movl $32, %r8d;
951 movl $24, %eax;
952 cmovel %eax, %r8d; /* max */
953
954 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
955 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
956 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
957
958 /*
959 * dst might still be in-use (in case dst == src), so use stack for
960 * temporary storage.
961 */
962 subq $(16 * 16), %rsp;
963 movq %rsp, %rax;
964
965 call __camellia_dec_blk16;
966
967 addq $(16 * 16), %rsp;
968
969 vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
970 vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
971 vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
972 vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
973 vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
974 vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
975 vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
976 vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
977 vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
978 vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
979 vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
980 vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
981 vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
982 vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
983 vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
984 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
985 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
986 %xmm8, %rsi);
987
988 ret;
989
990#define inc_le128(x, minus_one, tmp) \
991 vpcmpeqq minus_one, x, tmp; \
992 vpsubq minus_one, x, x; \
993 vpslldq $8, tmp, tmp; \
994 vpsubq tmp, x, x;
995
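inc_le128() increments a 128-bit little-endian counter held in an xmm register; the minus_one operand is preloaded as {low: -1, high: 0} by the CTR routine below. The scalar equivalent is simply a 64-bit increment with carry into the high word (a sketch, names illustrative):

#include <linux/types.h>

/* Scalar sketch of inc_le128(): 128-bit little-endian increment. */
static void inc_le128_scalar(u64 *lo, u64 *hi)
{
	if (++*lo == 0)		/* low word wrapped: carry into high word */
		++*hi;
}
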
996.align 8
997.global camellia_ctr_16way
998.type camellia_ctr_16way,@function;
999
1000camellia_ctr_16way:
1001 /* input:
1002 * %rdi: ctx, CTX
1003 * %rsi: dst (16 blocks)
1004 * %rdx: src (16 blocks)
1005 * %rcx: iv (little endian, 128bit)
1006 */
1007
1008 subq $(16 * 16), %rsp;
1009 movq %rsp, %rax;
1010
1011 vmovdqa .Lbswap128_mask, %xmm14;
1012
1013 /* load IV and byteswap */
1014 vmovdqu (%rcx), %xmm0;
1015 vpshufb %xmm14, %xmm0, %xmm15;
1016 vmovdqu %xmm15, 15 * 16(%rax);
1017
1018 vpcmpeqd %xmm15, %xmm15, %xmm15;
1019 vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
1020
1021 /* construct IVs */
1022 inc_le128(%xmm0, %xmm15, %xmm13);
1023 vpshufb %xmm14, %xmm0, %xmm13;
1024 vmovdqu %xmm13, 14 * 16(%rax);
1025 inc_le128(%xmm0, %xmm15, %xmm13);
1026 vpshufb %xmm14, %xmm0, %xmm13;
1027 vmovdqu %xmm13, 13 * 16(%rax);
1028 inc_le128(%xmm0, %xmm15, %xmm13);
1029 vpshufb %xmm14, %xmm0, %xmm12;
1030 inc_le128(%xmm0, %xmm15, %xmm13);
1031 vpshufb %xmm14, %xmm0, %xmm11;
1032 inc_le128(%xmm0, %xmm15, %xmm13);
1033 vpshufb %xmm14, %xmm0, %xmm10;
1034 inc_le128(%xmm0, %xmm15, %xmm13);
1035 vpshufb %xmm14, %xmm0, %xmm9;
1036 inc_le128(%xmm0, %xmm15, %xmm13);
1037 vpshufb %xmm14, %xmm0, %xmm8;
1038 inc_le128(%xmm0, %xmm15, %xmm13);
1039 vpshufb %xmm14, %xmm0, %xmm7;
1040 inc_le128(%xmm0, %xmm15, %xmm13);
1041 vpshufb %xmm14, %xmm0, %xmm6;
1042 inc_le128(%xmm0, %xmm15, %xmm13);
1043 vpshufb %xmm14, %xmm0, %xmm5;
1044 inc_le128(%xmm0, %xmm15, %xmm13);
1045 vpshufb %xmm14, %xmm0, %xmm4;
1046 inc_le128(%xmm0, %xmm15, %xmm13);
1047 vpshufb %xmm14, %xmm0, %xmm3;
1048 inc_le128(%xmm0, %xmm15, %xmm13);
1049 vpshufb %xmm14, %xmm0, %xmm2;
1050 inc_le128(%xmm0, %xmm15, %xmm13);
1051 vpshufb %xmm14, %xmm0, %xmm1;
1052 inc_le128(%xmm0, %xmm15, %xmm13);
1053 vmovdqa %xmm0, %xmm13;
1054 vpshufb %xmm14, %xmm0, %xmm0;
1055 inc_le128(%xmm13, %xmm15, %xmm14);
1056 vmovdqu %xmm13, (%rcx);
1057
1058 /* inpack16_pre: */
1059 vmovq (key_table)(CTX), %xmm15;
1060 vpshufb .Lpack_bswap, %xmm15, %xmm15;
1061 vpxor %xmm0, %xmm15, %xmm0;
1062 vpxor %xmm1, %xmm15, %xmm1;
1063 vpxor %xmm2, %xmm15, %xmm2;
1064 vpxor %xmm3, %xmm15, %xmm3;
1065 vpxor %xmm4, %xmm15, %xmm4;
1066 vpxor %xmm5, %xmm15, %xmm5;
1067 vpxor %xmm6, %xmm15, %xmm6;
1068 vpxor %xmm7, %xmm15, %xmm7;
1069 vpxor %xmm8, %xmm15, %xmm8;
1070 vpxor %xmm9, %xmm15, %xmm9;
1071 vpxor %xmm10, %xmm15, %xmm10;
1072 vpxor %xmm11, %xmm15, %xmm11;
1073 vpxor %xmm12, %xmm15, %xmm12;
1074 vpxor 13 * 16(%rax), %xmm15, %xmm13;
1075 vpxor 14 * 16(%rax), %xmm15, %xmm14;
1076 vpxor 15 * 16(%rax), %xmm15, %xmm15;
1077
1078 call __camellia_enc_blk16;
1079
1080 addq $(16 * 16), %rsp;
1081
1082 vpxor 0 * 16(%rdx), %xmm7, %xmm7;
1083 vpxor 1 * 16(%rdx), %xmm6, %xmm6;
1084 vpxor 2 * 16(%rdx), %xmm5, %xmm5;
1085 vpxor 3 * 16(%rdx), %xmm4, %xmm4;
1086 vpxor 4 * 16(%rdx), %xmm3, %xmm3;
1087 vpxor 5 * 16(%rdx), %xmm2, %xmm2;
1088 vpxor 6 * 16(%rdx), %xmm1, %xmm1;
1089 vpxor 7 * 16(%rdx), %xmm0, %xmm0;
1090 vpxor 8 * 16(%rdx), %xmm15, %xmm15;
1091 vpxor 9 * 16(%rdx), %xmm14, %xmm14;
1092 vpxor 10 * 16(%rdx), %xmm13, %xmm13;
1093 vpxor 11 * 16(%rdx), %xmm12, %xmm12;
1094 vpxor 12 * 16(%rdx), %xmm11, %xmm11;
1095 vpxor 13 * 16(%rdx), %xmm10, %xmm10;
1096 vpxor 14 * 16(%rdx), %xmm9, %xmm9;
1097 vpxor 15 * 16(%rdx), %xmm8, %xmm8;
1098 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1099 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1100 %xmm8, %rsi);
1101
1102 ret;
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
deleted file mode 100644
index 0b3374335fd..00000000000
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ /dev/null
@@ -1,520 +0,0 @@
1/*
2 * Camellia Cipher Algorithm (x86_64)
3 *
4 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19 * USA
20 *
21 */
22
23.file "camellia-x86_64-asm_64.S"
24.text
25
26.extern camellia_sp10011110;
27.extern camellia_sp22000222;
28.extern camellia_sp03303033;
29.extern camellia_sp00444404;
30.extern camellia_sp02220222;
31.extern camellia_sp30333033;
32.extern camellia_sp44044404;
33.extern camellia_sp11101110;
34
35#define sp10011110 camellia_sp10011110
36#define sp22000222 camellia_sp22000222
37#define sp03303033 camellia_sp03303033
38#define sp00444404 camellia_sp00444404
39#define sp02220222 camellia_sp02220222
40#define sp30333033 camellia_sp30333033
41#define sp44044404 camellia_sp44044404
42#define sp11101110 camellia_sp11101110
43
44#define CAMELLIA_TABLE_BYTE_LEN 272
45
46/* struct camellia_ctx: */
47#define key_table 0
48#define key_length CAMELLIA_TABLE_BYTE_LEN
49
50/* register macros */
51#define CTX %rdi
52#define RIO %rsi
53#define RIOd %esi
54
55#define RAB0 %rax
56#define RCD0 %rcx
57#define RAB1 %rbx
58#define RCD1 %rdx
59
60#define RAB0d %eax
61#define RCD0d %ecx
62#define RAB1d %ebx
63#define RCD1d %edx
64
65#define RAB0bl %al
66#define RCD0bl %cl
67#define RAB1bl %bl
68#define RCD1bl %dl
69
70#define RAB0bh %ah
71#define RCD0bh %ch
72#define RAB1bh %bh
73#define RCD1bh %dh
74
75#define RT0 %rsi
76#define RT1 %rbp
77#define RT2 %r8
78
79#define RT0d %esi
80#define RT1d %ebp
81#define RT2d %r8d
82
83#define RT2bl %r8b
84
85#define RXOR %r9
86#define RRBP %r10
87#define RDST %r11
88
89#define RXORd %r9d
90#define RXORbl %r9b
91
92#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
93 movzbl ab ## bl, tmp2 ## d; \
94 movzbl ab ## bh, tmp1 ## d; \
95 rorq $16, ab; \
96 xorq T0(, tmp2, 8), dst; \
97 xorq T1(, tmp1, 8), dst;
98
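Each xor2ror16() invocation consumes the two low bytes of the ab half: it xors two 8-byte s-box entries into dst and rotates ab right by 16 bits so the next byte pair lands in bl/bh. A scalar sketch (the 256-entry u64 tables stand in for the camellia_sp* tables declared above; the function name is illustrative):

#include <linux/types.h>

/* Scalar sketch of one xor2ror16() step. */
static void xor2ror16_scalar(const u64 *t0, const u64 *t1, u64 *ab, u64 *dst)
{
	*dst ^= t0[*ab & 0xff] ^ t1[(*ab >> 8) & 0xff];
	*ab = (*ab >> 16) | (*ab << 48);	/* rorq $16, ab */
}
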
99/**********************************************************************
100 1-way camellia
101 **********************************************************************/
102#define roundsm(ab, subkey, cd) \
103 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
104 \
105 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
106 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
107 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
108 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
109 \
110 xorq RT2, cd ## 0;
111
112#define fls(l, r, kl, kr) \
113 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
114 andl l ## 0d, RT0d; \
115 roll $1, RT0d; \
116 shlq $32, RT0; \
117 xorq RT0, l ## 0; \
118 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
119 orq r ## 0, RT1; \
120 shrq $32, RT1; \
121 xorq RT1, r ## 0; \
122 \
123 movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
124 orq l ## 0, RT2; \
125 shrq $32, RT2; \
126 xorq RT2, l ## 0; \
127 movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
128 andl r ## 0d, RT0d; \
129 roll $1, RT0d; \
130 shlq $32, RT0; \
131 xorq RT0, r ## 0;
132
133#define enc_rounds(i) \
134 roundsm(RAB, i + 2, RCD); \
135 roundsm(RCD, i + 3, RAB); \
136 roundsm(RAB, i + 4, RCD); \
137 roundsm(RCD, i + 5, RAB); \
138 roundsm(RAB, i + 6, RCD); \
139 roundsm(RCD, i + 7, RAB);
140
141#define enc_fls(i) \
142 fls(RAB, RCD, i + 0, i + 1);
143
144#define enc_inpack() \
145 movq (RIO), RAB0; \
146 bswapq RAB0; \
147 rolq $32, RAB0; \
148 movq 4*2(RIO), RCD0; \
149 bswapq RCD0; \
150 rorq $32, RCD0; \
151 xorq key_table(CTX), RAB0;
152
153#define enc_outunpack(op, max) \
154 xorq key_table(CTX, max, 8), RCD0; \
155 rorq $32, RCD0; \
156 bswapq RCD0; \
157 op ## q RCD0, (RIO); \
158 rolq $32, RAB0; \
159 bswapq RAB0; \
160 op ## q RAB0, 4*2(RIO);
161
162#define dec_rounds(i) \
163 roundsm(RAB, i + 7, RCD); \
164 roundsm(RCD, i + 6, RAB); \
165 roundsm(RAB, i + 5, RCD); \
166 roundsm(RCD, i + 4, RAB); \
167 roundsm(RAB, i + 3, RCD); \
168 roundsm(RCD, i + 2, RAB);
169
170#define dec_fls(i) \
171 fls(RAB, RCD, i + 1, i + 0);
172
173#define dec_inpack(max) \
174 movq (RIO), RAB0; \
175 bswapq RAB0; \
176 rolq $32, RAB0; \
177 movq 4*2(RIO), RCD0; \
178 bswapq RCD0; \
179 rorq $32, RCD0; \
180 xorq key_table(CTX, max, 8), RAB0;
181
182#define dec_outunpack() \
183 xorq key_table(CTX), RCD0; \
184 rorq $32, RCD0; \
185 bswapq RCD0; \
186 movq RCD0, (RIO); \
187 rolq $32, RAB0; \
188 bswapq RAB0; \
189 movq RAB0, 4*2(RIO);
190
191.global __camellia_enc_blk;
192.type __camellia_enc_blk,@function;
193
194__camellia_enc_blk:
195 /* input:
196 * %rdi: ctx, CTX
197 * %rsi: dst
198 * %rdx: src
199 * %rcx: bool xor
200 */
201 movq %rbp, RRBP;
202
203 movq %rcx, RXOR;
204 movq %rsi, RDST;
205 movq %rdx, RIO;
206
207 enc_inpack();
208
209 enc_rounds(0);
210 enc_fls(8);
211 enc_rounds(8);
212 enc_fls(16);
213 enc_rounds(16);
214 movl $24, RT1d; /* max */
215
216 cmpb $16, key_length(CTX);
217 je __enc_done;
218
219 enc_fls(24);
220 enc_rounds(24);
221 movl $32, RT1d; /* max */
222
223__enc_done:
224 testb RXORbl, RXORbl;
225 movq RDST, RIO;
226
227 jnz __enc_xor;
228
229 enc_outunpack(mov, RT1);
230
231 movq RRBP, %rbp;
232 ret;
233
234__enc_xor:
235 enc_outunpack(xor, RT1);
236
237 movq RRBP, %rbp;
238 ret;
239
240.global camellia_dec_blk;
241.type camellia_dec_blk,@function;
242
243camellia_dec_blk:
244 /* input:
245 * %rdi: ctx, CTX
246 * %rsi: dst
247 * %rdx: src
248 */
249 cmpl $16, key_length(CTX);
250 movl $32, RT2d;
251 movl $24, RXORd;
252 cmovel RXORd, RT2d; /* max */
253
254 movq %rbp, RRBP;
255 movq %rsi, RDST;
256 movq %rdx, RIO;
257
258 dec_inpack(RT2);
259
260 cmpb $24, RT2bl;
261 je __dec_rounds16;
262
263 dec_rounds(24);
264 dec_fls(24);
265
266__dec_rounds16:
267 dec_rounds(16);
268 dec_fls(16);
269 dec_rounds(8);
270 dec_fls(8);
271 dec_rounds(0);
272
273 movq RDST, RIO;
274
275 dec_outunpack();
276
277 movq RRBP, %rbp;
278 ret;
279
280/**********************************************************************
281 2-way camellia
282 **********************************************************************/
283#define roundsm2(ab, subkey, cd) \
284 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
285 xorq RT2, cd ## 1; \
286 \
287 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
288 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
289 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
290 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
291 \
292 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
293 xorq RT2, cd ## 0; \
294 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
295 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
296 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
297
298#define fls2(l, r, kl, kr) \
299 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
300 andl l ## 0d, RT0d; \
301 roll $1, RT0d; \
302 shlq $32, RT0; \
303 xorq RT0, l ## 0; \
304 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
305 orq r ## 0, RT1; \
306 shrq $32, RT1; \
307 xorq RT1, r ## 0; \
308 \
309 movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
310 andl l ## 1d, RT2d; \
311 roll $1, RT2d; \
312 shlq $32, RT2; \
313 xorq RT2, l ## 1; \
314 movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
315 orq r ## 1, RT0; \
316 shrq $32, RT0; \
317 xorq RT0, r ## 1; \
318 \
319 movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
320 orq l ## 0, RT1; \
321 shrq $32, RT1; \
322 xorq RT1, l ## 0; \
323 movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
324 andl r ## 0d, RT2d; \
325 roll $1, RT2d; \
326 shlq $32, RT2; \
327 xorq RT2, r ## 0; \
328 \
329 movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
330 orq l ## 1, RT0; \
331 shrq $32, RT0; \
332 xorq RT0, l ## 1; \
333 movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
334 andl r ## 1d, RT1d; \
335 roll $1, RT1d; \
336 shlq $32, RT1; \
337 xorq RT1, r ## 1;
338
339#define enc_rounds2(i) \
340 roundsm2(RAB, i + 2, RCD); \
341 roundsm2(RCD, i + 3, RAB); \
342 roundsm2(RAB, i + 4, RCD); \
343 roundsm2(RCD, i + 5, RAB); \
344 roundsm2(RAB, i + 6, RCD); \
345 roundsm2(RCD, i + 7, RAB);
346
347#define enc_fls2(i) \
348 fls2(RAB, RCD, i + 0, i + 1);
349
350#define enc_inpack2() \
351 movq (RIO), RAB0; \
352 bswapq RAB0; \
353 rorq $32, RAB0; \
354 movq 4*2(RIO), RCD0; \
355 bswapq RCD0; \
356 rolq $32, RCD0; \
357 xorq key_table(CTX), RAB0; \
358 \
359 movq 8*2(RIO), RAB1; \
360 bswapq RAB1; \
361 rorq $32, RAB1; \
362 movq 12*2(RIO), RCD1; \
363 bswapq RCD1; \
364 rolq $32, RCD1; \
365 xorq key_table(CTX), RAB1;
366
367#define enc_outunpack2(op, max) \
368 xorq key_table(CTX, max, 8), RCD0; \
369 rolq $32, RCD0; \
370 bswapq RCD0; \
371 op ## q RCD0, (RIO); \
372 rorq $32, RAB0; \
373 bswapq RAB0; \
374 op ## q RAB0, 4*2(RIO); \
375 \
376 xorq key_table(CTX, max, 8), RCD1; \
377 rolq $32, RCD1; \
378 bswapq RCD1; \
379 op ## q RCD1, 8*2(RIO); \
380 rorq $32, RAB1; \
381 bswapq RAB1; \
382 op ## q RAB1, 12*2(RIO);
383
384#define dec_rounds2(i) \
385 roundsm2(RAB, i + 7, RCD); \
386 roundsm2(RCD, i + 6, RAB); \
387 roundsm2(RAB, i + 5, RCD); \
388 roundsm2(RCD, i + 4, RAB); \
389 roundsm2(RAB, i + 3, RCD); \
390 roundsm2(RCD, i + 2, RAB);
391
392#define dec_fls2(i) \
393 fls2(RAB, RCD, i + 1, i + 0);
394
395#define dec_inpack2(max) \
396 movq (RIO), RAB0; \
397 bswapq RAB0; \
398 rorq $32, RAB0; \
399 movq 4*2(RIO), RCD0; \
400 bswapq RCD0; \
401 rolq $32, RCD0; \
402 xorq key_table(CTX, max, 8), RAB0; \
403 \
404 movq 8*2(RIO), RAB1; \
405 bswapq RAB1; \
406 rorq $32, RAB1; \
407 movq 12*2(RIO), RCD1; \
408 bswapq RCD1; \
409 rolq $32, RCD1; \
410 xorq key_table(CTX, max, 8), RAB1;
411
412#define dec_outunpack2() \
413 xorq key_table(CTX), RCD0; \
414 rolq $32, RCD0; \
415 bswapq RCD0; \
416 movq RCD0, (RIO); \
417 rorq $32, RAB0; \
418 bswapq RAB0; \
419 movq RAB0, 4*2(RIO); \
420 \
421 xorq key_table(CTX), RCD1; \
422 rolq $32, RCD1; \
423 bswapq RCD1; \
424 movq RCD1, 8*2(RIO); \
425 rorq $32, RAB1; \
426 bswapq RAB1; \
427 movq RAB1, 12*2(RIO);
428
429.global __camellia_enc_blk_2way;
430.type __camellia_enc_blk_2way,@function;
431
432__camellia_enc_blk_2way:
433 /* input:
434 * %rdi: ctx, CTX
435 * %rsi: dst
436 * %rdx: src
437 * %rcx: bool xor
438 */
439 pushq %rbx;
440
441 movq %rbp, RRBP;
442 movq %rcx, RXOR;
443 movq %rsi, RDST;
444 movq %rdx, RIO;
445
446 enc_inpack2();
447
448 enc_rounds2(0);
449 enc_fls2(8);
450 enc_rounds2(8);
451 enc_fls2(16);
452 enc_rounds2(16);
453 movl $24, RT2d; /* max */
454
455 cmpb $16, key_length(CTX);
456 je __enc2_done;
457
458 enc_fls2(24);
459 enc_rounds2(24);
460 movl $32, RT2d; /* max */
461
462__enc2_done:
463 test RXORbl, RXORbl;
464 movq RDST, RIO;
465 jnz __enc2_xor;
466
467 enc_outunpack2(mov, RT2);
468
469 movq RRBP, %rbp;
470 popq %rbx;
471 ret;
472
473__enc2_xor:
474 enc_outunpack2(xor, RT2);
475
476 movq RRBP, %rbp;
477 popq %rbx;
478 ret;
479
480.global camellia_dec_blk_2way;
481.type camellia_dec_blk_2way,@function;
482
483camellia_dec_blk_2way:
484 /* input:
485 * %rdi: ctx, CTX
486 * %rsi: dst
487 * %rdx: src
488 */
489 cmpl $16, key_length(CTX);
490 movl $32, RT2d;
491 movl $24, RXORd;
492 cmovel RXORd, RT2d; /* max */
493
494 movq %rbx, RXOR;
495 movq %rbp, RRBP;
496 movq %rsi, RDST;
497 movq %rdx, RIO;
498
499 dec_inpack2(RT2);
500
501 cmpb $24, RT2bl;
502 je __dec2_rounds16;
503
504 dec_rounds2(24);
505 dec_fls2(24);
506
507__dec2_rounds16:
508 dec_rounds2(16);
509 dec_fls2(16);
510 dec_rounds2(8);
511 dec_fls2(8);
512 dec_rounds2(0);
513
514 movq RDST, RIO;
515
516 dec_outunpack2();
517
518 movq RRBP, %rbp;
519 movq RXOR, %rbx;
520 ret;
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
deleted file mode 100644
index 96cbb6068fc..00000000000
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ /dev/null
@@ -1,558 +0,0 @@
1/*
2 * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/crypto.h>
16#include <linux/err.h>
17#include <crypto/algapi.h>
18#include <crypto/ctr.h>
19#include <crypto/lrw.h>
20#include <crypto/xts.h>
21#include <asm/xcr.h>
22#include <asm/xsave.h>
23#include <asm/crypto/camellia.h>
24#include <asm/crypto/ablk_helper.h>
25#include <asm/crypto/glue_helper.h>
26
27#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
28
29/* 16-way AES-NI parallel cipher functions */
30asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
31 const u8 *src);
32asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
33 const u8 *src);
34
35asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
36 const u8 *src);
37asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
38 const u8 *src, le128 *iv);
39
40static const struct common_glue_ctx camellia_enc = {
41 .num_funcs = 3,
42 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
43
44 .funcs = { {
45 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
46 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
47 }, {
48 .num_blocks = 2,
49 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
50 }, {
51 .num_blocks = 1,
52 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
53 } }
54};
55
56static const struct common_glue_ctx camellia_ctr = {
57 .num_funcs = 3,
58 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
59
60 .funcs = { {
61 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
62 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
63 }, {
64 .num_blocks = 2,
65 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
66 }, {
67 .num_blocks = 1,
68 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
69 } }
70};
71
72static const struct common_glue_ctx camellia_dec = {
73 .num_funcs = 3,
74 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
75
76 .funcs = { {
77 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
78 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
79 }, {
80 .num_blocks = 2,
81 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
82 }, {
83 .num_blocks = 1,
84 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
85 } }
86};
87
88static const struct common_glue_ctx camellia_dec_cbc = {
89 .num_funcs = 3,
90 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
91
92 .funcs = { {
93 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
94 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
95 }, {
96 .num_blocks = 2,
97 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
98 }, {
99 .num_blocks = 1,
100 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
101 } }
102};
103
104static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
105 struct scatterlist *src, unsigned int nbytes)
106{
107 return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
108}
109
110static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
111 struct scatterlist *src, unsigned int nbytes)
112{
113 return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
114}
115
116static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
117 struct scatterlist *src, unsigned int nbytes)
118{
119 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
120 dst, src, nbytes);
121}
122
123static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
124 struct scatterlist *src, unsigned int nbytes)
125{
126 return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
127 nbytes);
128}
129
130static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
134}
135
136static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
137{
138 return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
139 CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
140 nbytes);
141}
142
143static inline void camellia_fpu_end(bool fpu_enabled)
144{
145 glue_fpu_end(fpu_enabled);
146}
147
148static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
149 unsigned int key_len)
150{
151 return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
152 &tfm->crt_flags);
153}
154
155struct crypt_priv {
156 struct camellia_ctx *ctx;
157 bool fpu_enabled;
158};
159
160static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
161{
162 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
163 struct crypt_priv *ctx = priv;
164 int i;
165
166 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
167
168 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
169 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
170 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
171 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
172 }
173
174 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
175 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
176 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
177 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
178 }
179
180 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
181 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
182}
183
184static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
185{
186 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
187 struct crypt_priv *ctx = priv;
188 int i;
189
190 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
191
192 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
193 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
194 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
195 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
196 }
197
198 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
199 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
200 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
201 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
202 }
203
204 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
205 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
206}
207
208static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
209 struct scatterlist *src, unsigned int nbytes)
210{
211 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
212 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
213 struct crypt_priv crypt_ctx = {
214 .ctx = &ctx->camellia_ctx,
215 .fpu_enabled = false,
216 };
217 struct lrw_crypt_req req = {
218 .tbuf = buf,
219 .tbuflen = sizeof(buf),
220
221 .table_ctx = &ctx->lrw_table,
222 .crypt_ctx = &crypt_ctx,
223 .crypt_fn = encrypt_callback,
224 };
225 int ret;
226
227 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
228 ret = lrw_crypt(desc, dst, src, nbytes, &req);
229 camellia_fpu_end(crypt_ctx.fpu_enabled);
230
231 return ret;
232}
233
234static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
235 struct scatterlist *src, unsigned int nbytes)
236{
237 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
238 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
239 struct crypt_priv crypt_ctx = {
240 .ctx = &ctx->camellia_ctx,
241 .fpu_enabled = false,
242 };
243 struct lrw_crypt_req req = {
244 .tbuf = buf,
245 .tbuflen = sizeof(buf),
246
247 .table_ctx = &ctx->lrw_table,
248 .crypt_ctx = &crypt_ctx,
249 .crypt_fn = decrypt_callback,
250 };
251 int ret;
252
253 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
254 ret = lrw_crypt(desc, dst, src, nbytes, &req);
255 camellia_fpu_end(crypt_ctx.fpu_enabled);
256
257 return ret;
258}
259
260static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
261 struct scatterlist *src, unsigned int nbytes)
262{
263 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
264 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
265 struct crypt_priv crypt_ctx = {
266 .ctx = &ctx->crypt_ctx,
267 .fpu_enabled = false,
268 };
269 struct xts_crypt_req req = {
270 .tbuf = buf,
271 .tbuflen = sizeof(buf),
272
273 .tweak_ctx = &ctx->tweak_ctx,
274 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
275 .crypt_ctx = &crypt_ctx,
276 .crypt_fn = encrypt_callback,
277 };
278 int ret;
279
280 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
281 ret = xts_crypt(desc, dst, src, nbytes, &req);
282 camellia_fpu_end(crypt_ctx.fpu_enabled);
283
284 return ret;
285}
286
287static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
288 struct scatterlist *src, unsigned int nbytes)
289{
290 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
291 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
292 struct crypt_priv crypt_ctx = {
293 .ctx = &ctx->crypt_ctx,
294 .fpu_enabled = false,
295 };
296 struct xts_crypt_req req = {
297 .tbuf = buf,
298 .tbuflen = sizeof(buf),
299
300 .tweak_ctx = &ctx->tweak_ctx,
301 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
302 .crypt_ctx = &crypt_ctx,
303 .crypt_fn = decrypt_callback,
304 };
305 int ret;
306
307 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
308 ret = xts_crypt(desc, dst, src, nbytes, &req);
309 camellia_fpu_end(crypt_ctx.fpu_enabled);
310
311 return ret;
312}
313
314static struct crypto_alg cmll_algs[10] = { {
315 .cra_name = "__ecb-camellia-aesni",
316 .cra_driver_name = "__driver-ecb-camellia-aesni",
317 .cra_priority = 0,
318 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
319 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
320 .cra_ctxsize = sizeof(struct camellia_ctx),
321 .cra_alignmask = 0,
322 .cra_type = &crypto_blkcipher_type,
323 .cra_module = THIS_MODULE,
324 .cra_u = {
325 .blkcipher = {
326 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
327 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
328 .setkey = camellia_setkey,
329 .encrypt = ecb_encrypt,
330 .decrypt = ecb_decrypt,
331 },
332 },
333}, {
334 .cra_name = "__cbc-camellia-aesni",
335 .cra_driver_name = "__driver-cbc-camellia-aesni",
336 .cra_priority = 0,
337 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
338 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
339 .cra_ctxsize = sizeof(struct camellia_ctx),
340 .cra_alignmask = 0,
341 .cra_type = &crypto_blkcipher_type,
342 .cra_module = THIS_MODULE,
343 .cra_u = {
344 .blkcipher = {
345 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
346 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
347 .setkey = camellia_setkey,
348 .encrypt = cbc_encrypt,
349 .decrypt = cbc_decrypt,
350 },
351 },
352}, {
353 .cra_name = "__ctr-camellia-aesni",
354 .cra_driver_name = "__driver-ctr-camellia-aesni",
355 .cra_priority = 0,
356 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
357 .cra_blocksize = 1,
358 .cra_ctxsize = sizeof(struct camellia_ctx),
359 .cra_alignmask = 0,
360 .cra_type = &crypto_blkcipher_type,
361 .cra_module = THIS_MODULE,
362 .cra_u = {
363 .blkcipher = {
364 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
365 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
366 .ivsize = CAMELLIA_BLOCK_SIZE,
367 .setkey = camellia_setkey,
368 .encrypt = ctr_crypt,
369 .decrypt = ctr_crypt,
370 },
371 },
372}, {
373 .cra_name = "__lrw-camellia-aesni",
374 .cra_driver_name = "__driver-lrw-camellia-aesni",
375 .cra_priority = 0,
376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
377 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
378 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
379 .cra_alignmask = 0,
380 .cra_type = &crypto_blkcipher_type,
381 .cra_module = THIS_MODULE,
382 .cra_exit = lrw_camellia_exit_tfm,
383 .cra_u = {
384 .blkcipher = {
385 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
386 CAMELLIA_BLOCK_SIZE,
387 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
388 CAMELLIA_BLOCK_SIZE,
389 .ivsize = CAMELLIA_BLOCK_SIZE,
390 .setkey = lrw_camellia_setkey,
391 .encrypt = lrw_encrypt,
392 .decrypt = lrw_decrypt,
393 },
394 },
395}, {
396 .cra_name = "__xts-camellia-aesni",
397 .cra_driver_name = "__driver-xts-camellia-aesni",
398 .cra_priority = 0,
399 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
400 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
401 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
402 .cra_alignmask = 0,
403 .cra_type = &crypto_blkcipher_type,
404 .cra_module = THIS_MODULE,
405 .cra_u = {
406 .blkcipher = {
407 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
408 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
409 .ivsize = CAMELLIA_BLOCK_SIZE,
410 .setkey = xts_camellia_setkey,
411 .encrypt = xts_encrypt,
412 .decrypt = xts_decrypt,
413 },
414 },
415}, {
416 .cra_name = "ecb(camellia)",
417 .cra_driver_name = "ecb-camellia-aesni",
418 .cra_priority = 400,
419 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
420 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
421 .cra_ctxsize = sizeof(struct async_helper_ctx),
422 .cra_alignmask = 0,
423 .cra_type = &crypto_ablkcipher_type,
424 .cra_module = THIS_MODULE,
425 .cra_init = ablk_init,
426 .cra_exit = ablk_exit,
427 .cra_u = {
428 .ablkcipher = {
429 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
430 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
431 .setkey = ablk_set_key,
432 .encrypt = ablk_encrypt,
433 .decrypt = ablk_decrypt,
434 },
435 },
436}, {
437 .cra_name = "cbc(camellia)",
438 .cra_driver_name = "cbc-camellia-aesni",
439 .cra_priority = 400,
440 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
441 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
442 .cra_ctxsize = sizeof(struct async_helper_ctx),
443 .cra_alignmask = 0,
444 .cra_type = &crypto_ablkcipher_type,
445 .cra_module = THIS_MODULE,
446 .cra_init = ablk_init,
447 .cra_exit = ablk_exit,
448 .cra_u = {
449 .ablkcipher = {
450 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
451 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
452 .ivsize = CAMELLIA_BLOCK_SIZE,
453 .setkey = ablk_set_key,
454 .encrypt = __ablk_encrypt,
455 .decrypt = ablk_decrypt,
456 },
457 },
458}, {
459 .cra_name = "ctr(camellia)",
460 .cra_driver_name = "ctr-camellia-aesni",
461 .cra_priority = 400,
462 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
463 .cra_blocksize = 1,
464 .cra_ctxsize = sizeof(struct async_helper_ctx),
465 .cra_alignmask = 0,
466 .cra_type = &crypto_ablkcipher_type,
467 .cra_module = THIS_MODULE,
468 .cra_init = ablk_init,
469 .cra_exit = ablk_exit,
470 .cra_u = {
471 .ablkcipher = {
472 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
473 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
474 .ivsize = CAMELLIA_BLOCK_SIZE,
475 .setkey = ablk_set_key,
476 .encrypt = ablk_encrypt,
477 .decrypt = ablk_encrypt,
478 .geniv = "chainiv",
479 },
480 },
481}, {
482 .cra_name = "lrw(camellia)",
483 .cra_driver_name = "lrw-camellia-aesni",
484 .cra_priority = 400,
485 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
486 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
487 .cra_ctxsize = sizeof(struct async_helper_ctx),
488 .cra_alignmask = 0,
489 .cra_type = &crypto_ablkcipher_type,
490 .cra_module = THIS_MODULE,
491 .cra_init = ablk_init,
492 .cra_exit = ablk_exit,
493 .cra_u = {
494 .ablkcipher = {
495 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
496 CAMELLIA_BLOCK_SIZE,
497 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
498 CAMELLIA_BLOCK_SIZE,
499 .ivsize = CAMELLIA_BLOCK_SIZE,
500 .setkey = ablk_set_key,
501 .encrypt = ablk_encrypt,
502 .decrypt = ablk_decrypt,
503 },
504 },
505}, {
506 .cra_name = "xts(camellia)",
507 .cra_driver_name = "xts-camellia-aesni",
508 .cra_priority = 400,
509 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
510 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
511 .cra_ctxsize = sizeof(struct async_helper_ctx),
512 .cra_alignmask = 0,
513 .cra_type = &crypto_ablkcipher_type,
514 .cra_module = THIS_MODULE,
515 .cra_init = ablk_init,
516 .cra_exit = ablk_exit,
517 .cra_u = {
518 .ablkcipher = {
519 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
520 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
521 .ivsize = CAMELLIA_BLOCK_SIZE,
522 .setkey = ablk_set_key,
523 .encrypt = ablk_encrypt,
524 .decrypt = ablk_decrypt,
525 },
526 },
527} };
528
529static int __init camellia_aesni_init(void)
530{
531 u64 xcr0;
532
533 if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
534 pr_info("AVX or AES-NI instructions are not detected.\n");
535 return -ENODEV;
536 }
537
538 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
539 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
540 pr_info("AVX detected but unusable.\n");
541 return -ENODEV;
542 }
543
544 return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
545}
546
547static void __exit camellia_aesni_fini(void)
548{
549 crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
550}
551
552module_init(camellia_aesni_init);
553module_exit(camellia_aesni_fini);
554
555MODULE_LICENSE("GPL");
556MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
557MODULE_ALIAS("camellia");
558MODULE_ALIAS("camellia-asm");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
deleted file mode 100644
index 5cb86ccd4ac..00000000000
--- a/arch/x86/crypto/camellia_glue.c
+++ /dev/null
@@ -1,1729 +0,0 @@
1/*
2 * Glue Code for assembler optimized version of Camellia
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Camellia parts based on code by:
7 * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
22 * USA
23 *
24 */
25
26#include <asm/processor.h>
27#include <asm/unaligned.h>
28#include <linux/crypto.h>
29#include <linux/init.h>
30#include <linux/module.h>
31#include <linux/types.h>
32#include <crypto/algapi.h>
33#include <crypto/lrw.h>
34#include <crypto/xts.h>
35#include <asm/crypto/camellia.h>
36#include <asm/crypto/glue_helper.h>
37
38/* regular block cipher functions */
39asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
40 const u8 *src, bool xor);
41EXPORT_SYMBOL_GPL(__camellia_enc_blk);
42asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
43 const u8 *src);
44EXPORT_SYMBOL_GPL(camellia_dec_blk);
45
46/* 2-way parallel cipher functions */
47asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
48 const u8 *src, bool xor);
49EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
50asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
51 const u8 *src);
52EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
53
54static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
55{
56 camellia_enc_blk(crypto_tfm_ctx(tfm), dst, src);
57}
58
59static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
60{
61 camellia_dec_blk(crypto_tfm_ctx(tfm), dst, src);
62}
63
64/* camellia sboxes */
65const u64 camellia_sp10011110[256] = {
66 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL,
67 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL,
68 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL,
69 0x8500008585858500ULL, 0x5700005757575700ULL, 0x3500003535353500ULL,
70 0xea0000eaeaeaea00ULL, 0x0c00000c0c0c0c00ULL, 0xae0000aeaeaeae00ULL,
71 0x4100004141414100ULL, 0x2300002323232300ULL, 0xef0000efefefef00ULL,
72 0x6b00006b6b6b6b00ULL, 0x9300009393939300ULL, 0x4500004545454500ULL,
73 0x1900001919191900ULL, 0xa50000a5a5a5a500ULL, 0x2100002121212100ULL,
74 0xed0000edededed00ULL, 0x0e00000e0e0e0e00ULL, 0x4f00004f4f4f4f00ULL,
75 0x4e00004e4e4e4e00ULL, 0x1d00001d1d1d1d00ULL, 0x6500006565656500ULL,
76 0x9200009292929200ULL, 0xbd0000bdbdbdbd00ULL, 0x8600008686868600ULL,
77 0xb80000b8b8b8b800ULL, 0xaf0000afafafaf00ULL, 0x8f00008f8f8f8f00ULL,
78 0x7c00007c7c7c7c00ULL, 0xeb0000ebebebeb00ULL, 0x1f00001f1f1f1f00ULL,
79 0xce0000cececece00ULL, 0x3e00003e3e3e3e00ULL, 0x3000003030303000ULL,
80 0xdc0000dcdcdcdc00ULL, 0x5f00005f5f5f5f00ULL, 0x5e00005e5e5e5e00ULL,
81 0xc50000c5c5c5c500ULL, 0x0b00000b0b0b0b00ULL, 0x1a00001a1a1a1a00ULL,
82 0xa60000a6a6a6a600ULL, 0xe10000e1e1e1e100ULL, 0x3900003939393900ULL,
83 0xca0000cacacaca00ULL, 0xd50000d5d5d5d500ULL, 0x4700004747474700ULL,
84 0x5d00005d5d5d5d00ULL, 0x3d00003d3d3d3d00ULL, 0xd90000d9d9d9d900ULL,
85 0x0100000101010100ULL, 0x5a00005a5a5a5a00ULL, 0xd60000d6d6d6d600ULL,
86 0x5100005151515100ULL, 0x5600005656565600ULL, 0x6c00006c6c6c6c00ULL,
87 0x4d00004d4d4d4d00ULL, 0x8b00008b8b8b8b00ULL, 0x0d00000d0d0d0d00ULL,
88 0x9a00009a9a9a9a00ULL, 0x6600006666666600ULL, 0xfb0000fbfbfbfb00ULL,
89 0xcc0000cccccccc00ULL, 0xb00000b0b0b0b000ULL, 0x2d00002d2d2d2d00ULL,
90 0x7400007474747400ULL, 0x1200001212121200ULL, 0x2b00002b2b2b2b00ULL,
91 0x2000002020202000ULL, 0xf00000f0f0f0f000ULL, 0xb10000b1b1b1b100ULL,
92 0x8400008484848400ULL, 0x9900009999999900ULL, 0xdf0000dfdfdfdf00ULL,
93 0x4c00004c4c4c4c00ULL, 0xcb0000cbcbcbcb00ULL, 0xc20000c2c2c2c200ULL,
94 0x3400003434343400ULL, 0x7e00007e7e7e7e00ULL, 0x7600007676767600ULL,
95 0x0500000505050500ULL, 0x6d00006d6d6d6d00ULL, 0xb70000b7b7b7b700ULL,
96 0xa90000a9a9a9a900ULL, 0x3100003131313100ULL, 0xd10000d1d1d1d100ULL,
97 0x1700001717171700ULL, 0x0400000404040400ULL, 0xd70000d7d7d7d700ULL,
98 0x1400001414141400ULL, 0x5800005858585800ULL, 0x3a00003a3a3a3a00ULL,
99 0x6100006161616100ULL, 0xde0000dededede00ULL, 0x1b00001b1b1b1b00ULL,
100 0x1100001111111100ULL, 0x1c00001c1c1c1c00ULL, 0x3200003232323200ULL,
101 0x0f00000f0f0f0f00ULL, 0x9c00009c9c9c9c00ULL, 0x1600001616161600ULL,
102 0x5300005353535300ULL, 0x1800001818181800ULL, 0xf20000f2f2f2f200ULL,
103 0x2200002222222200ULL, 0xfe0000fefefefe00ULL, 0x4400004444444400ULL,
104 0xcf0000cfcfcfcf00ULL, 0xb20000b2b2b2b200ULL, 0xc30000c3c3c3c300ULL,
105 0xb50000b5b5b5b500ULL, 0x7a00007a7a7a7a00ULL, 0x9100009191919100ULL,
106 0x2400002424242400ULL, 0x0800000808080800ULL, 0xe80000e8e8e8e800ULL,
107 0xa80000a8a8a8a800ULL, 0x6000006060606000ULL, 0xfc0000fcfcfcfc00ULL,
108 0x6900006969696900ULL, 0x5000005050505000ULL, 0xaa0000aaaaaaaa00ULL,
109 0xd00000d0d0d0d000ULL, 0xa00000a0a0a0a000ULL, 0x7d00007d7d7d7d00ULL,
110 0xa10000a1a1a1a100ULL, 0x8900008989898900ULL, 0x6200006262626200ULL,
111 0x9700009797979700ULL, 0x5400005454545400ULL, 0x5b00005b5b5b5b00ULL,
112 0x1e00001e1e1e1e00ULL, 0x9500009595959500ULL, 0xe00000e0e0e0e000ULL,
113 0xff0000ffffffff00ULL, 0x6400006464646400ULL, 0xd20000d2d2d2d200ULL,
114 0x1000001010101000ULL, 0xc40000c4c4c4c400ULL, 0x0000000000000000ULL,
115 0x4800004848484800ULL, 0xa30000a3a3a3a300ULL, 0xf70000f7f7f7f700ULL,
116 0x7500007575757500ULL, 0xdb0000dbdbdbdb00ULL, 0x8a00008a8a8a8a00ULL,
117 0x0300000303030300ULL, 0xe60000e6e6e6e600ULL, 0xda0000dadadada00ULL,
118 0x0900000909090900ULL, 0x3f00003f3f3f3f00ULL, 0xdd0000dddddddd00ULL,
119 0x9400009494949400ULL, 0x8700008787878700ULL, 0x5c00005c5c5c5c00ULL,
120 0x8300008383838300ULL, 0x0200000202020200ULL, 0xcd0000cdcdcdcd00ULL,
121 0x4a00004a4a4a4a00ULL, 0x9000009090909000ULL, 0x3300003333333300ULL,
122 0x7300007373737300ULL, 0x6700006767676700ULL, 0xf60000f6f6f6f600ULL,
123 0xf30000f3f3f3f300ULL, 0x9d00009d9d9d9d00ULL, 0x7f00007f7f7f7f00ULL,
124 0xbf0000bfbfbfbf00ULL, 0xe20000e2e2e2e200ULL, 0x5200005252525200ULL,
125 0x9b00009b9b9b9b00ULL, 0xd80000d8d8d8d800ULL, 0x2600002626262600ULL,
126 0xc80000c8c8c8c800ULL, 0x3700003737373700ULL, 0xc60000c6c6c6c600ULL,
127 0x3b00003b3b3b3b00ULL, 0x8100008181818100ULL, 0x9600009696969600ULL,
128 0x6f00006f6f6f6f00ULL, 0x4b00004b4b4b4b00ULL, 0x1300001313131300ULL,
129 0xbe0000bebebebe00ULL, 0x6300006363636300ULL, 0x2e00002e2e2e2e00ULL,
130 0xe90000e9e9e9e900ULL, 0x7900007979797900ULL, 0xa70000a7a7a7a700ULL,
131 0x8c00008c8c8c8c00ULL, 0x9f00009f9f9f9f00ULL, 0x6e00006e6e6e6e00ULL,
132 0xbc0000bcbcbcbc00ULL, 0x8e00008e8e8e8e00ULL, 0x2900002929292900ULL,
133 0xf50000f5f5f5f500ULL, 0xf90000f9f9f9f900ULL, 0xb60000b6b6b6b600ULL,
134 0x2f00002f2f2f2f00ULL, 0xfd0000fdfdfdfd00ULL, 0xb40000b4b4b4b400ULL,
135 0x5900005959595900ULL, 0x7800007878787800ULL, 0x9800009898989800ULL,
136 0x0600000606060600ULL, 0x6a00006a6a6a6a00ULL, 0xe70000e7e7e7e700ULL,
137 0x4600004646464600ULL, 0x7100007171717100ULL, 0xba0000babababa00ULL,
138 0xd40000d4d4d4d400ULL, 0x2500002525252500ULL, 0xab0000abababab00ULL,
139 0x4200004242424200ULL, 0x8800008888888800ULL, 0xa20000a2a2a2a200ULL,
140 0x8d00008d8d8d8d00ULL, 0xfa0000fafafafa00ULL, 0x7200007272727200ULL,
141 0x0700000707070700ULL, 0xb90000b9b9b9b900ULL, 0x5500005555555500ULL,
142 0xf80000f8f8f8f800ULL, 0xee0000eeeeeeee00ULL, 0xac0000acacacac00ULL,
143 0x0a00000a0a0a0a00ULL, 0x3600003636363600ULL, 0x4900004949494900ULL,
144 0x2a00002a2a2a2a00ULL, 0x6800006868686800ULL, 0x3c00003c3c3c3c00ULL,
145 0x3800003838383800ULL, 0xf10000f1f1f1f100ULL, 0xa40000a4a4a4a400ULL,
146 0x4000004040404000ULL, 0x2800002828282800ULL, 0xd30000d3d3d3d300ULL,
147 0x7b00007b7b7b7b00ULL, 0xbb0000bbbbbbbb00ULL, 0xc90000c9c9c9c900ULL,
148 0x4300004343434300ULL, 0xc10000c1c1c1c100ULL, 0x1500001515151500ULL,
149 0xe30000e3e3e3e300ULL, 0xad0000adadadad00ULL, 0xf40000f4f4f4f400ULL,
150 0x7700007777777700ULL, 0xc70000c7c7c7c700ULL, 0x8000008080808000ULL,
151 0x9e00009e9e9e9e00ULL,
152};
153
154const u64 camellia_sp22000222[256] = {
155 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL,
156 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL,
157 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL,
158 0x0b0b0000000b0b0bULL, 0xaeae000000aeaeaeULL, 0x6a6a0000006a6a6aULL,
159 0xd5d5000000d5d5d5ULL, 0x1818000000181818ULL, 0x5d5d0000005d5d5dULL,
160 0x8282000000828282ULL, 0x4646000000464646ULL, 0xdfdf000000dfdfdfULL,
161 0xd6d6000000d6d6d6ULL, 0x2727000000272727ULL, 0x8a8a0000008a8a8aULL,
162 0x3232000000323232ULL, 0x4b4b0000004b4b4bULL, 0x4242000000424242ULL,
163 0xdbdb000000dbdbdbULL, 0x1c1c0000001c1c1cULL, 0x9e9e0000009e9e9eULL,
164 0x9c9c0000009c9c9cULL, 0x3a3a0000003a3a3aULL, 0xcaca000000cacacaULL,
165 0x2525000000252525ULL, 0x7b7b0000007b7b7bULL, 0x0d0d0000000d0d0dULL,
166 0x7171000000717171ULL, 0x5f5f0000005f5f5fULL, 0x1f1f0000001f1f1fULL,
167 0xf8f8000000f8f8f8ULL, 0xd7d7000000d7d7d7ULL, 0x3e3e0000003e3e3eULL,
168 0x9d9d0000009d9d9dULL, 0x7c7c0000007c7c7cULL, 0x6060000000606060ULL,
169 0xb9b9000000b9b9b9ULL, 0xbebe000000bebebeULL, 0xbcbc000000bcbcbcULL,
170 0x8b8b0000008b8b8bULL, 0x1616000000161616ULL, 0x3434000000343434ULL,
171 0x4d4d0000004d4d4dULL, 0xc3c3000000c3c3c3ULL, 0x7272000000727272ULL,
172 0x9595000000959595ULL, 0xabab000000abababULL, 0x8e8e0000008e8e8eULL,
173 0xbaba000000bababaULL, 0x7a7a0000007a7a7aULL, 0xb3b3000000b3b3b3ULL,
174 0x0202000000020202ULL, 0xb4b4000000b4b4b4ULL, 0xadad000000adadadULL,
175 0xa2a2000000a2a2a2ULL, 0xacac000000acacacULL, 0xd8d8000000d8d8d8ULL,
176 0x9a9a0000009a9a9aULL, 0x1717000000171717ULL, 0x1a1a0000001a1a1aULL,
177 0x3535000000353535ULL, 0xcccc000000ccccccULL, 0xf7f7000000f7f7f7ULL,
178 0x9999000000999999ULL, 0x6161000000616161ULL, 0x5a5a0000005a5a5aULL,
179 0xe8e8000000e8e8e8ULL, 0x2424000000242424ULL, 0x5656000000565656ULL,
180 0x4040000000404040ULL, 0xe1e1000000e1e1e1ULL, 0x6363000000636363ULL,
181 0x0909000000090909ULL, 0x3333000000333333ULL, 0xbfbf000000bfbfbfULL,
182 0x9898000000989898ULL, 0x9797000000979797ULL, 0x8585000000858585ULL,
183 0x6868000000686868ULL, 0xfcfc000000fcfcfcULL, 0xecec000000ecececULL,
184 0x0a0a0000000a0a0aULL, 0xdada000000dadadaULL, 0x6f6f0000006f6f6fULL,
185 0x5353000000535353ULL, 0x6262000000626262ULL, 0xa3a3000000a3a3a3ULL,
186 0x2e2e0000002e2e2eULL, 0x0808000000080808ULL, 0xafaf000000afafafULL,
187 0x2828000000282828ULL, 0xb0b0000000b0b0b0ULL, 0x7474000000747474ULL,
188 0xc2c2000000c2c2c2ULL, 0xbdbd000000bdbdbdULL, 0x3636000000363636ULL,
189 0x2222000000222222ULL, 0x3838000000383838ULL, 0x6464000000646464ULL,
190 0x1e1e0000001e1e1eULL, 0x3939000000393939ULL, 0x2c2c0000002c2c2cULL,
191 0xa6a6000000a6a6a6ULL, 0x3030000000303030ULL, 0xe5e5000000e5e5e5ULL,
192 0x4444000000444444ULL, 0xfdfd000000fdfdfdULL, 0x8888000000888888ULL,
193 0x9f9f0000009f9f9fULL, 0x6565000000656565ULL, 0x8787000000878787ULL,
194 0x6b6b0000006b6b6bULL, 0xf4f4000000f4f4f4ULL, 0x2323000000232323ULL,
195 0x4848000000484848ULL, 0x1010000000101010ULL, 0xd1d1000000d1d1d1ULL,
196 0x5151000000515151ULL, 0xc0c0000000c0c0c0ULL, 0xf9f9000000f9f9f9ULL,
197 0xd2d2000000d2d2d2ULL, 0xa0a0000000a0a0a0ULL, 0x5555000000555555ULL,
198 0xa1a1000000a1a1a1ULL, 0x4141000000414141ULL, 0xfafa000000fafafaULL,
199 0x4343000000434343ULL, 0x1313000000131313ULL, 0xc4c4000000c4c4c4ULL,
200 0x2f2f0000002f2f2fULL, 0xa8a8000000a8a8a8ULL, 0xb6b6000000b6b6b6ULL,
201 0x3c3c0000003c3c3cULL, 0x2b2b0000002b2b2bULL, 0xc1c1000000c1c1c1ULL,
202 0xffff000000ffffffULL, 0xc8c8000000c8c8c8ULL, 0xa5a5000000a5a5a5ULL,
203 0x2020000000202020ULL, 0x8989000000898989ULL, 0x0000000000000000ULL,
204 0x9090000000909090ULL, 0x4747000000474747ULL, 0xefef000000efefefULL,
205 0xeaea000000eaeaeaULL, 0xb7b7000000b7b7b7ULL, 0x1515000000151515ULL,
206 0x0606000000060606ULL, 0xcdcd000000cdcdcdULL, 0xb5b5000000b5b5b5ULL,
207 0x1212000000121212ULL, 0x7e7e0000007e7e7eULL, 0xbbbb000000bbbbbbULL,
208 0x2929000000292929ULL, 0x0f0f0000000f0f0fULL, 0xb8b8000000b8b8b8ULL,
209 0x0707000000070707ULL, 0x0404000000040404ULL, 0x9b9b0000009b9b9bULL,
210 0x9494000000949494ULL, 0x2121000000212121ULL, 0x6666000000666666ULL,
211 0xe6e6000000e6e6e6ULL, 0xcece000000cececeULL, 0xeded000000edededULL,
212 0xe7e7000000e7e7e7ULL, 0x3b3b0000003b3b3bULL, 0xfefe000000fefefeULL,
213 0x7f7f0000007f7f7fULL, 0xc5c5000000c5c5c5ULL, 0xa4a4000000a4a4a4ULL,
214 0x3737000000373737ULL, 0xb1b1000000b1b1b1ULL, 0x4c4c0000004c4c4cULL,
215 0x9191000000919191ULL, 0x6e6e0000006e6e6eULL, 0x8d8d0000008d8d8dULL,
216 0x7676000000767676ULL, 0x0303000000030303ULL, 0x2d2d0000002d2d2dULL,
217 0xdede000000dededeULL, 0x9696000000969696ULL, 0x2626000000262626ULL,
218 0x7d7d0000007d7d7dULL, 0xc6c6000000c6c6c6ULL, 0x5c5c0000005c5c5cULL,
219 0xd3d3000000d3d3d3ULL, 0xf2f2000000f2f2f2ULL, 0x4f4f0000004f4f4fULL,
220 0x1919000000191919ULL, 0x3f3f0000003f3f3fULL, 0xdcdc000000dcdcdcULL,
221 0x7979000000797979ULL, 0x1d1d0000001d1d1dULL, 0x5252000000525252ULL,
222 0xebeb000000ebebebULL, 0xf3f3000000f3f3f3ULL, 0x6d6d0000006d6d6dULL,
223 0x5e5e0000005e5e5eULL, 0xfbfb000000fbfbfbULL, 0x6969000000696969ULL,
224 0xb2b2000000b2b2b2ULL, 0xf0f0000000f0f0f0ULL, 0x3131000000313131ULL,
225 0x0c0c0000000c0c0cULL, 0xd4d4000000d4d4d4ULL, 0xcfcf000000cfcfcfULL,
226 0x8c8c0000008c8c8cULL, 0xe2e2000000e2e2e2ULL, 0x7575000000757575ULL,
227 0xa9a9000000a9a9a9ULL, 0x4a4a0000004a4a4aULL, 0x5757000000575757ULL,
228 0x8484000000848484ULL, 0x1111000000111111ULL, 0x4545000000454545ULL,
229 0x1b1b0000001b1b1bULL, 0xf5f5000000f5f5f5ULL, 0xe4e4000000e4e4e4ULL,
230 0x0e0e0000000e0e0eULL, 0x7373000000737373ULL, 0xaaaa000000aaaaaaULL,
231 0xf1f1000000f1f1f1ULL, 0xdddd000000ddddddULL, 0x5959000000595959ULL,
232 0x1414000000141414ULL, 0x6c6c0000006c6c6cULL, 0x9292000000929292ULL,
233 0x5454000000545454ULL, 0xd0d0000000d0d0d0ULL, 0x7878000000787878ULL,
234 0x7070000000707070ULL, 0xe3e3000000e3e3e3ULL, 0x4949000000494949ULL,
235 0x8080000000808080ULL, 0x5050000000505050ULL, 0xa7a7000000a7a7a7ULL,
236 0xf6f6000000f6f6f6ULL, 0x7777000000777777ULL, 0x9393000000939393ULL,
237 0x8686000000868686ULL, 0x8383000000838383ULL, 0x2a2a0000002a2a2aULL,
238 0xc7c7000000c7c7c7ULL, 0x5b5b0000005b5b5bULL, 0xe9e9000000e9e9e9ULL,
239 0xeeee000000eeeeeeULL, 0x8f8f0000008f8f8fULL, 0x0101000000010101ULL,
240 0x3d3d0000003d3d3dULL,
241};
242
243const u64 camellia_sp03303033[256] = {
244 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL,
245 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL,
246 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL,
247 0x00c2c200c200c2c2ULL, 0x00abab00ab00ababULL, 0x009a9a009a009a9aULL,
248 0x0075750075007575ULL, 0x0006060006000606ULL, 0x0057570057005757ULL,
249 0x00a0a000a000a0a0ULL, 0x0091910091009191ULL, 0x00f7f700f700f7f7ULL,
250 0x00b5b500b500b5b5ULL, 0x00c9c900c900c9c9ULL, 0x00a2a200a200a2a2ULL,
251 0x008c8c008c008c8cULL, 0x00d2d200d200d2d2ULL, 0x0090900090009090ULL,
252 0x00f6f600f600f6f6ULL, 0x0007070007000707ULL, 0x00a7a700a700a7a7ULL,
253 0x0027270027002727ULL, 0x008e8e008e008e8eULL, 0x00b2b200b200b2b2ULL,
254 0x0049490049004949ULL, 0x00dede00de00dedeULL, 0x0043430043004343ULL,
255 0x005c5c005c005c5cULL, 0x00d7d700d700d7d7ULL, 0x00c7c700c700c7c7ULL,
256 0x003e3e003e003e3eULL, 0x00f5f500f500f5f5ULL, 0x008f8f008f008f8fULL,
257 0x0067670067006767ULL, 0x001f1f001f001f1fULL, 0x0018180018001818ULL,
258 0x006e6e006e006e6eULL, 0x00afaf00af00afafULL, 0x002f2f002f002f2fULL,
259 0x00e2e200e200e2e2ULL, 0x0085850085008585ULL, 0x000d0d000d000d0dULL,
260 0x0053530053005353ULL, 0x00f0f000f000f0f0ULL, 0x009c9c009c009c9cULL,
261 0x0065650065006565ULL, 0x00eaea00ea00eaeaULL, 0x00a3a300a300a3a3ULL,
262 0x00aeae00ae00aeaeULL, 0x009e9e009e009e9eULL, 0x00ecec00ec00ececULL,
263 0x0080800080008080ULL, 0x002d2d002d002d2dULL, 0x006b6b006b006b6bULL,
264 0x00a8a800a800a8a8ULL, 0x002b2b002b002b2bULL, 0x0036360036003636ULL,
265 0x00a6a600a600a6a6ULL, 0x00c5c500c500c5c5ULL, 0x0086860086008686ULL,
266 0x004d4d004d004d4dULL, 0x0033330033003333ULL, 0x00fdfd00fd00fdfdULL,
267 0x0066660066006666ULL, 0x0058580058005858ULL, 0x0096960096009696ULL,
268 0x003a3a003a003a3aULL, 0x0009090009000909ULL, 0x0095950095009595ULL,
269 0x0010100010001010ULL, 0x0078780078007878ULL, 0x00d8d800d800d8d8ULL,
270 0x0042420042004242ULL, 0x00cccc00cc00ccccULL, 0x00efef00ef00efefULL,
271 0x0026260026002626ULL, 0x00e5e500e500e5e5ULL, 0x0061610061006161ULL,
272 0x001a1a001a001a1aULL, 0x003f3f003f003f3fULL, 0x003b3b003b003b3bULL,
273 0x0082820082008282ULL, 0x00b6b600b600b6b6ULL, 0x00dbdb00db00dbdbULL,
274 0x00d4d400d400d4d4ULL, 0x0098980098009898ULL, 0x00e8e800e800e8e8ULL,
275 0x008b8b008b008b8bULL, 0x0002020002000202ULL, 0x00ebeb00eb00ebebULL,
276 0x000a0a000a000a0aULL, 0x002c2c002c002c2cULL, 0x001d1d001d001d1dULL,
277 0x00b0b000b000b0b0ULL, 0x006f6f006f006f6fULL, 0x008d8d008d008d8dULL,
278 0x0088880088008888ULL, 0x000e0e000e000e0eULL, 0x0019190019001919ULL,
279 0x0087870087008787ULL, 0x004e4e004e004e4eULL, 0x000b0b000b000b0bULL,
280 0x00a9a900a900a9a9ULL, 0x000c0c000c000c0cULL, 0x0079790079007979ULL,
281 0x0011110011001111ULL, 0x007f7f007f007f7fULL, 0x0022220022002222ULL,
282 0x00e7e700e700e7e7ULL, 0x0059590059005959ULL, 0x00e1e100e100e1e1ULL,
283 0x00dada00da00dadaULL, 0x003d3d003d003d3dULL, 0x00c8c800c800c8c8ULL,
284 0x0012120012001212ULL, 0x0004040004000404ULL, 0x0074740074007474ULL,
285 0x0054540054005454ULL, 0x0030300030003030ULL, 0x007e7e007e007e7eULL,
286 0x00b4b400b400b4b4ULL, 0x0028280028002828ULL, 0x0055550055005555ULL,
287 0x0068680068006868ULL, 0x0050500050005050ULL, 0x00bebe00be00bebeULL,
288 0x00d0d000d000d0d0ULL, 0x00c4c400c400c4c4ULL, 0x0031310031003131ULL,
289 0x00cbcb00cb00cbcbULL, 0x002a2a002a002a2aULL, 0x00adad00ad00adadULL,
290 0x000f0f000f000f0fULL, 0x00caca00ca00cacaULL, 0x0070700070007070ULL,
291 0x00ffff00ff00ffffULL, 0x0032320032003232ULL, 0x0069690069006969ULL,
292 0x0008080008000808ULL, 0x0062620062006262ULL, 0x0000000000000000ULL,
293 0x0024240024002424ULL, 0x00d1d100d100d1d1ULL, 0x00fbfb00fb00fbfbULL,
294 0x00baba00ba00babaULL, 0x00eded00ed00ededULL, 0x0045450045004545ULL,
295 0x0081810081008181ULL, 0x0073730073007373ULL, 0x006d6d006d006d6dULL,
296 0x0084840084008484ULL, 0x009f9f009f009f9fULL, 0x00eeee00ee00eeeeULL,
297 0x004a4a004a004a4aULL, 0x00c3c300c300c3c3ULL, 0x002e2e002e002e2eULL,
298 0x00c1c100c100c1c1ULL, 0x0001010001000101ULL, 0x00e6e600e600e6e6ULL,
299 0x0025250025002525ULL, 0x0048480048004848ULL, 0x0099990099009999ULL,
300 0x00b9b900b900b9b9ULL, 0x00b3b300b300b3b3ULL, 0x007b7b007b007b7bULL,
301 0x00f9f900f900f9f9ULL, 0x00cece00ce00ceceULL, 0x00bfbf00bf00bfbfULL,
302 0x00dfdf00df00dfdfULL, 0x0071710071007171ULL, 0x0029290029002929ULL,
303 0x00cdcd00cd00cdcdULL, 0x006c6c006c006c6cULL, 0x0013130013001313ULL,
304 0x0064640064006464ULL, 0x009b9b009b009b9bULL, 0x0063630063006363ULL,
305 0x009d9d009d009d9dULL, 0x00c0c000c000c0c0ULL, 0x004b4b004b004b4bULL,
306 0x00b7b700b700b7b7ULL, 0x00a5a500a500a5a5ULL, 0x0089890089008989ULL,
307 0x005f5f005f005f5fULL, 0x00b1b100b100b1b1ULL, 0x0017170017001717ULL,
308 0x00f4f400f400f4f4ULL, 0x00bcbc00bc00bcbcULL, 0x00d3d300d300d3d3ULL,
309 0x0046460046004646ULL, 0x00cfcf00cf00cfcfULL, 0x0037370037003737ULL,
310 0x005e5e005e005e5eULL, 0x0047470047004747ULL, 0x0094940094009494ULL,
311 0x00fafa00fa00fafaULL, 0x00fcfc00fc00fcfcULL, 0x005b5b005b005b5bULL,
312 0x0097970097009797ULL, 0x00fefe00fe00fefeULL, 0x005a5a005a005a5aULL,
313 0x00acac00ac00acacULL, 0x003c3c003c003c3cULL, 0x004c4c004c004c4cULL,
314 0x0003030003000303ULL, 0x0035350035003535ULL, 0x00f3f300f300f3f3ULL,
315 0x0023230023002323ULL, 0x00b8b800b800b8b8ULL, 0x005d5d005d005d5dULL,
316 0x006a6a006a006a6aULL, 0x0092920092009292ULL, 0x00d5d500d500d5d5ULL,
317 0x0021210021002121ULL, 0x0044440044004444ULL, 0x0051510051005151ULL,
318 0x00c6c600c600c6c6ULL, 0x007d7d007d007d7dULL, 0x0039390039003939ULL,
319 0x0083830083008383ULL, 0x00dcdc00dc00dcdcULL, 0x00aaaa00aa00aaaaULL,
320 0x007c7c007c007c7cULL, 0x0077770077007777ULL, 0x0056560056005656ULL,
321 0x0005050005000505ULL, 0x001b1b001b001b1bULL, 0x00a4a400a400a4a4ULL,
322 0x0015150015001515ULL, 0x0034340034003434ULL, 0x001e1e001e001e1eULL,
323 0x001c1c001c001c1cULL, 0x00f8f800f800f8f8ULL, 0x0052520052005252ULL,
324 0x0020200020002020ULL, 0x0014140014001414ULL, 0x00e9e900e900e9e9ULL,
325 0x00bdbd00bd00bdbdULL, 0x00dddd00dd00ddddULL, 0x00e4e400e400e4e4ULL,
326 0x00a1a100a100a1a1ULL, 0x00e0e000e000e0e0ULL, 0x008a8a008a008a8aULL,
327 0x00f1f100f100f1f1ULL, 0x00d6d600d600d6d6ULL, 0x007a7a007a007a7aULL,
328 0x00bbbb00bb00bbbbULL, 0x00e3e300e300e3e3ULL, 0x0040400040004040ULL,
329 0x004f4f004f004f4fULL,
330};
331
332const u64 camellia_sp00444404[256] = {
333 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL,
334 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL,
335 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL,
336 0x00006b6b6b6b006bULL, 0x0000454545450045ULL, 0x0000a5a5a5a500a5ULL,
337 0x0000edededed00edULL, 0x00004f4f4f4f004fULL, 0x00001d1d1d1d001dULL,
338 0x0000929292920092ULL, 0x0000868686860086ULL, 0x0000afafafaf00afULL,
339 0x00007c7c7c7c007cULL, 0x00001f1f1f1f001fULL, 0x00003e3e3e3e003eULL,
340 0x0000dcdcdcdc00dcULL, 0x00005e5e5e5e005eULL, 0x00000b0b0b0b000bULL,
341 0x0000a6a6a6a600a6ULL, 0x0000393939390039ULL, 0x0000d5d5d5d500d5ULL,
342 0x00005d5d5d5d005dULL, 0x0000d9d9d9d900d9ULL, 0x00005a5a5a5a005aULL,
343 0x0000515151510051ULL, 0x00006c6c6c6c006cULL, 0x00008b8b8b8b008bULL,
344 0x00009a9a9a9a009aULL, 0x0000fbfbfbfb00fbULL, 0x0000b0b0b0b000b0ULL,
345 0x0000747474740074ULL, 0x00002b2b2b2b002bULL, 0x0000f0f0f0f000f0ULL,
346 0x0000848484840084ULL, 0x0000dfdfdfdf00dfULL, 0x0000cbcbcbcb00cbULL,
347 0x0000343434340034ULL, 0x0000767676760076ULL, 0x00006d6d6d6d006dULL,
348 0x0000a9a9a9a900a9ULL, 0x0000d1d1d1d100d1ULL, 0x0000040404040004ULL,
349 0x0000141414140014ULL, 0x00003a3a3a3a003aULL, 0x0000dededede00deULL,
350 0x0000111111110011ULL, 0x0000323232320032ULL, 0x00009c9c9c9c009cULL,
351 0x0000535353530053ULL, 0x0000f2f2f2f200f2ULL, 0x0000fefefefe00feULL,
352 0x0000cfcfcfcf00cfULL, 0x0000c3c3c3c300c3ULL, 0x00007a7a7a7a007aULL,
353 0x0000242424240024ULL, 0x0000e8e8e8e800e8ULL, 0x0000606060600060ULL,
354 0x0000696969690069ULL, 0x0000aaaaaaaa00aaULL, 0x0000a0a0a0a000a0ULL,
355 0x0000a1a1a1a100a1ULL, 0x0000626262620062ULL, 0x0000545454540054ULL,
356 0x00001e1e1e1e001eULL, 0x0000e0e0e0e000e0ULL, 0x0000646464640064ULL,
357 0x0000101010100010ULL, 0x0000000000000000ULL, 0x0000a3a3a3a300a3ULL,
358 0x0000757575750075ULL, 0x00008a8a8a8a008aULL, 0x0000e6e6e6e600e6ULL,
359 0x0000090909090009ULL, 0x0000dddddddd00ddULL, 0x0000878787870087ULL,
360 0x0000838383830083ULL, 0x0000cdcdcdcd00cdULL, 0x0000909090900090ULL,
361 0x0000737373730073ULL, 0x0000f6f6f6f600f6ULL, 0x00009d9d9d9d009dULL,
362 0x0000bfbfbfbf00bfULL, 0x0000525252520052ULL, 0x0000d8d8d8d800d8ULL,
363 0x0000c8c8c8c800c8ULL, 0x0000c6c6c6c600c6ULL, 0x0000818181810081ULL,
364 0x00006f6f6f6f006fULL, 0x0000131313130013ULL, 0x0000636363630063ULL,
365 0x0000e9e9e9e900e9ULL, 0x0000a7a7a7a700a7ULL, 0x00009f9f9f9f009fULL,
366 0x0000bcbcbcbc00bcULL, 0x0000292929290029ULL, 0x0000f9f9f9f900f9ULL,
367 0x00002f2f2f2f002fULL, 0x0000b4b4b4b400b4ULL, 0x0000787878780078ULL,
368 0x0000060606060006ULL, 0x0000e7e7e7e700e7ULL, 0x0000717171710071ULL,
369 0x0000d4d4d4d400d4ULL, 0x0000abababab00abULL, 0x0000888888880088ULL,
370 0x00008d8d8d8d008dULL, 0x0000727272720072ULL, 0x0000b9b9b9b900b9ULL,
371 0x0000f8f8f8f800f8ULL, 0x0000acacacac00acULL, 0x0000363636360036ULL,
372 0x00002a2a2a2a002aULL, 0x00003c3c3c3c003cULL, 0x0000f1f1f1f100f1ULL,
373 0x0000404040400040ULL, 0x0000d3d3d3d300d3ULL, 0x0000bbbbbbbb00bbULL,
374 0x0000434343430043ULL, 0x0000151515150015ULL, 0x0000adadadad00adULL,
375 0x0000777777770077ULL, 0x0000808080800080ULL, 0x0000828282820082ULL,
376 0x0000ecececec00ecULL, 0x0000272727270027ULL, 0x0000e5e5e5e500e5ULL,
377 0x0000858585850085ULL, 0x0000353535350035ULL, 0x00000c0c0c0c000cULL,
378 0x0000414141410041ULL, 0x0000efefefef00efULL, 0x0000939393930093ULL,
379 0x0000191919190019ULL, 0x0000212121210021ULL, 0x00000e0e0e0e000eULL,
380 0x00004e4e4e4e004eULL, 0x0000656565650065ULL, 0x0000bdbdbdbd00bdULL,
381 0x0000b8b8b8b800b8ULL, 0x00008f8f8f8f008fULL, 0x0000ebebebeb00ebULL,
382 0x0000cececece00ceULL, 0x0000303030300030ULL, 0x00005f5f5f5f005fULL,
383 0x0000c5c5c5c500c5ULL, 0x00001a1a1a1a001aULL, 0x0000e1e1e1e100e1ULL,
384 0x0000cacacaca00caULL, 0x0000474747470047ULL, 0x00003d3d3d3d003dULL,
385 0x0000010101010001ULL, 0x0000d6d6d6d600d6ULL, 0x0000565656560056ULL,
386 0x00004d4d4d4d004dULL, 0x00000d0d0d0d000dULL, 0x0000666666660066ULL,
387 0x0000cccccccc00ccULL, 0x00002d2d2d2d002dULL, 0x0000121212120012ULL,
388 0x0000202020200020ULL, 0x0000b1b1b1b100b1ULL, 0x0000999999990099ULL,
389 0x00004c4c4c4c004cULL, 0x0000c2c2c2c200c2ULL, 0x00007e7e7e7e007eULL,
390 0x0000050505050005ULL, 0x0000b7b7b7b700b7ULL, 0x0000313131310031ULL,
391 0x0000171717170017ULL, 0x0000d7d7d7d700d7ULL, 0x0000585858580058ULL,
392 0x0000616161610061ULL, 0x00001b1b1b1b001bULL, 0x00001c1c1c1c001cULL,
393 0x00000f0f0f0f000fULL, 0x0000161616160016ULL, 0x0000181818180018ULL,
394 0x0000222222220022ULL, 0x0000444444440044ULL, 0x0000b2b2b2b200b2ULL,
395 0x0000b5b5b5b500b5ULL, 0x0000919191910091ULL, 0x0000080808080008ULL,
396 0x0000a8a8a8a800a8ULL, 0x0000fcfcfcfc00fcULL, 0x0000505050500050ULL,
397 0x0000d0d0d0d000d0ULL, 0x00007d7d7d7d007dULL, 0x0000898989890089ULL,
398 0x0000979797970097ULL, 0x00005b5b5b5b005bULL, 0x0000959595950095ULL,
399 0x0000ffffffff00ffULL, 0x0000d2d2d2d200d2ULL, 0x0000c4c4c4c400c4ULL,
400 0x0000484848480048ULL, 0x0000f7f7f7f700f7ULL, 0x0000dbdbdbdb00dbULL,
401 0x0000030303030003ULL, 0x0000dadadada00daULL, 0x00003f3f3f3f003fULL,
402 0x0000949494940094ULL, 0x00005c5c5c5c005cULL, 0x0000020202020002ULL,
403 0x00004a4a4a4a004aULL, 0x0000333333330033ULL, 0x0000676767670067ULL,
404 0x0000f3f3f3f300f3ULL, 0x00007f7f7f7f007fULL, 0x0000e2e2e2e200e2ULL,
405 0x00009b9b9b9b009bULL, 0x0000262626260026ULL, 0x0000373737370037ULL,
406 0x00003b3b3b3b003bULL, 0x0000969696960096ULL, 0x00004b4b4b4b004bULL,
407 0x0000bebebebe00beULL, 0x00002e2e2e2e002eULL, 0x0000797979790079ULL,
408 0x00008c8c8c8c008cULL, 0x00006e6e6e6e006eULL, 0x00008e8e8e8e008eULL,
409 0x0000f5f5f5f500f5ULL, 0x0000b6b6b6b600b6ULL, 0x0000fdfdfdfd00fdULL,
410 0x0000595959590059ULL, 0x0000989898980098ULL, 0x00006a6a6a6a006aULL,
411 0x0000464646460046ULL, 0x0000babababa00baULL, 0x0000252525250025ULL,
412 0x0000424242420042ULL, 0x0000a2a2a2a200a2ULL, 0x0000fafafafa00faULL,
413 0x0000070707070007ULL, 0x0000555555550055ULL, 0x0000eeeeeeee00eeULL,
414 0x00000a0a0a0a000aULL, 0x0000494949490049ULL, 0x0000686868680068ULL,
415 0x0000383838380038ULL, 0x0000a4a4a4a400a4ULL, 0x0000282828280028ULL,
416 0x00007b7b7b7b007bULL, 0x0000c9c9c9c900c9ULL, 0x0000c1c1c1c100c1ULL,
417 0x0000e3e3e3e300e3ULL, 0x0000f4f4f4f400f4ULL, 0x0000c7c7c7c700c7ULL,
418 0x00009e9e9e9e009eULL,
419};
420
421const u64 camellia_sp02220222[256] = {
422 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL,
423 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL,
424 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL,
425 0x000b0b0b000b0b0bULL, 0x00aeaeae00aeaeaeULL, 0x006a6a6a006a6a6aULL,
426 0x00d5d5d500d5d5d5ULL, 0x0018181800181818ULL, 0x005d5d5d005d5d5dULL,
427 0x0082828200828282ULL, 0x0046464600464646ULL, 0x00dfdfdf00dfdfdfULL,
428 0x00d6d6d600d6d6d6ULL, 0x0027272700272727ULL, 0x008a8a8a008a8a8aULL,
429 0x0032323200323232ULL, 0x004b4b4b004b4b4bULL, 0x0042424200424242ULL,
430 0x00dbdbdb00dbdbdbULL, 0x001c1c1c001c1c1cULL, 0x009e9e9e009e9e9eULL,
431 0x009c9c9c009c9c9cULL, 0x003a3a3a003a3a3aULL, 0x00cacaca00cacacaULL,
432 0x0025252500252525ULL, 0x007b7b7b007b7b7bULL, 0x000d0d0d000d0d0dULL,
433 0x0071717100717171ULL, 0x005f5f5f005f5f5fULL, 0x001f1f1f001f1f1fULL,
434 0x00f8f8f800f8f8f8ULL, 0x00d7d7d700d7d7d7ULL, 0x003e3e3e003e3e3eULL,
435 0x009d9d9d009d9d9dULL, 0x007c7c7c007c7c7cULL, 0x0060606000606060ULL,
436 0x00b9b9b900b9b9b9ULL, 0x00bebebe00bebebeULL, 0x00bcbcbc00bcbcbcULL,
437 0x008b8b8b008b8b8bULL, 0x0016161600161616ULL, 0x0034343400343434ULL,
438 0x004d4d4d004d4d4dULL, 0x00c3c3c300c3c3c3ULL, 0x0072727200727272ULL,
439 0x0095959500959595ULL, 0x00ababab00abababULL, 0x008e8e8e008e8e8eULL,
440 0x00bababa00bababaULL, 0x007a7a7a007a7a7aULL, 0x00b3b3b300b3b3b3ULL,
441 0x0002020200020202ULL, 0x00b4b4b400b4b4b4ULL, 0x00adadad00adadadULL,
442 0x00a2a2a200a2a2a2ULL, 0x00acacac00acacacULL, 0x00d8d8d800d8d8d8ULL,
443 0x009a9a9a009a9a9aULL, 0x0017171700171717ULL, 0x001a1a1a001a1a1aULL,
444 0x0035353500353535ULL, 0x00cccccc00ccccccULL, 0x00f7f7f700f7f7f7ULL,
445 0x0099999900999999ULL, 0x0061616100616161ULL, 0x005a5a5a005a5a5aULL,
446 0x00e8e8e800e8e8e8ULL, 0x0024242400242424ULL, 0x0056565600565656ULL,
447 0x0040404000404040ULL, 0x00e1e1e100e1e1e1ULL, 0x0063636300636363ULL,
448 0x0009090900090909ULL, 0x0033333300333333ULL, 0x00bfbfbf00bfbfbfULL,
449 0x0098989800989898ULL, 0x0097979700979797ULL, 0x0085858500858585ULL,
450 0x0068686800686868ULL, 0x00fcfcfc00fcfcfcULL, 0x00ececec00ecececULL,
451 0x000a0a0a000a0a0aULL, 0x00dadada00dadadaULL, 0x006f6f6f006f6f6fULL,
452 0x0053535300535353ULL, 0x0062626200626262ULL, 0x00a3a3a300a3a3a3ULL,
453 0x002e2e2e002e2e2eULL, 0x0008080800080808ULL, 0x00afafaf00afafafULL,
454 0x0028282800282828ULL, 0x00b0b0b000b0b0b0ULL, 0x0074747400747474ULL,
455 0x00c2c2c200c2c2c2ULL, 0x00bdbdbd00bdbdbdULL, 0x0036363600363636ULL,
456 0x0022222200222222ULL, 0x0038383800383838ULL, 0x0064646400646464ULL,
457 0x001e1e1e001e1e1eULL, 0x0039393900393939ULL, 0x002c2c2c002c2c2cULL,
458 0x00a6a6a600a6a6a6ULL, 0x0030303000303030ULL, 0x00e5e5e500e5e5e5ULL,
459 0x0044444400444444ULL, 0x00fdfdfd00fdfdfdULL, 0x0088888800888888ULL,
460 0x009f9f9f009f9f9fULL, 0x0065656500656565ULL, 0x0087878700878787ULL,
461 0x006b6b6b006b6b6bULL, 0x00f4f4f400f4f4f4ULL, 0x0023232300232323ULL,
462 0x0048484800484848ULL, 0x0010101000101010ULL, 0x00d1d1d100d1d1d1ULL,
463 0x0051515100515151ULL, 0x00c0c0c000c0c0c0ULL, 0x00f9f9f900f9f9f9ULL,
464 0x00d2d2d200d2d2d2ULL, 0x00a0a0a000a0a0a0ULL, 0x0055555500555555ULL,
465 0x00a1a1a100a1a1a1ULL, 0x0041414100414141ULL, 0x00fafafa00fafafaULL,
466 0x0043434300434343ULL, 0x0013131300131313ULL, 0x00c4c4c400c4c4c4ULL,
467 0x002f2f2f002f2f2fULL, 0x00a8a8a800a8a8a8ULL, 0x00b6b6b600b6b6b6ULL,
468 0x003c3c3c003c3c3cULL, 0x002b2b2b002b2b2bULL, 0x00c1c1c100c1c1c1ULL,
469 0x00ffffff00ffffffULL, 0x00c8c8c800c8c8c8ULL, 0x00a5a5a500a5a5a5ULL,
470 0x0020202000202020ULL, 0x0089898900898989ULL, 0x0000000000000000ULL,
471 0x0090909000909090ULL, 0x0047474700474747ULL, 0x00efefef00efefefULL,
472 0x00eaeaea00eaeaeaULL, 0x00b7b7b700b7b7b7ULL, 0x0015151500151515ULL,
473 0x0006060600060606ULL, 0x00cdcdcd00cdcdcdULL, 0x00b5b5b500b5b5b5ULL,
474 0x0012121200121212ULL, 0x007e7e7e007e7e7eULL, 0x00bbbbbb00bbbbbbULL,
475 0x0029292900292929ULL, 0x000f0f0f000f0f0fULL, 0x00b8b8b800b8b8b8ULL,
476 0x0007070700070707ULL, 0x0004040400040404ULL, 0x009b9b9b009b9b9bULL,
477 0x0094949400949494ULL, 0x0021212100212121ULL, 0x0066666600666666ULL,
478 0x00e6e6e600e6e6e6ULL, 0x00cecece00cececeULL, 0x00ededed00edededULL,
479 0x00e7e7e700e7e7e7ULL, 0x003b3b3b003b3b3bULL, 0x00fefefe00fefefeULL,
480 0x007f7f7f007f7f7fULL, 0x00c5c5c500c5c5c5ULL, 0x00a4a4a400a4a4a4ULL,
481 0x0037373700373737ULL, 0x00b1b1b100b1b1b1ULL, 0x004c4c4c004c4c4cULL,
482 0x0091919100919191ULL, 0x006e6e6e006e6e6eULL, 0x008d8d8d008d8d8dULL,
483 0x0076767600767676ULL, 0x0003030300030303ULL, 0x002d2d2d002d2d2dULL,
484 0x00dedede00dededeULL, 0x0096969600969696ULL, 0x0026262600262626ULL,
485 0x007d7d7d007d7d7dULL, 0x00c6c6c600c6c6c6ULL, 0x005c5c5c005c5c5cULL,
486 0x00d3d3d300d3d3d3ULL, 0x00f2f2f200f2f2f2ULL, 0x004f4f4f004f4f4fULL,
487 0x0019191900191919ULL, 0x003f3f3f003f3f3fULL, 0x00dcdcdc00dcdcdcULL,
488 0x0079797900797979ULL, 0x001d1d1d001d1d1dULL, 0x0052525200525252ULL,
489 0x00ebebeb00ebebebULL, 0x00f3f3f300f3f3f3ULL, 0x006d6d6d006d6d6dULL,
490 0x005e5e5e005e5e5eULL, 0x00fbfbfb00fbfbfbULL, 0x0069696900696969ULL,
491 0x00b2b2b200b2b2b2ULL, 0x00f0f0f000f0f0f0ULL, 0x0031313100313131ULL,
492 0x000c0c0c000c0c0cULL, 0x00d4d4d400d4d4d4ULL, 0x00cfcfcf00cfcfcfULL,
493 0x008c8c8c008c8c8cULL, 0x00e2e2e200e2e2e2ULL, 0x0075757500757575ULL,
494 0x00a9a9a900a9a9a9ULL, 0x004a4a4a004a4a4aULL, 0x0057575700575757ULL,
495 0x0084848400848484ULL, 0x0011111100111111ULL, 0x0045454500454545ULL,
496 0x001b1b1b001b1b1bULL, 0x00f5f5f500f5f5f5ULL, 0x00e4e4e400e4e4e4ULL,
497 0x000e0e0e000e0e0eULL, 0x0073737300737373ULL, 0x00aaaaaa00aaaaaaULL,
498 0x00f1f1f100f1f1f1ULL, 0x00dddddd00ddddddULL, 0x0059595900595959ULL,
499 0x0014141400141414ULL, 0x006c6c6c006c6c6cULL, 0x0092929200929292ULL,
500 0x0054545400545454ULL, 0x00d0d0d000d0d0d0ULL, 0x0078787800787878ULL,
501 0x0070707000707070ULL, 0x00e3e3e300e3e3e3ULL, 0x0049494900494949ULL,
502 0x0080808000808080ULL, 0x0050505000505050ULL, 0x00a7a7a700a7a7a7ULL,
503 0x00f6f6f600f6f6f6ULL, 0x0077777700777777ULL, 0x0093939300939393ULL,
504 0x0086868600868686ULL, 0x0083838300838383ULL, 0x002a2a2a002a2a2aULL,
505 0x00c7c7c700c7c7c7ULL, 0x005b5b5b005b5b5bULL, 0x00e9e9e900e9e9e9ULL,
506 0x00eeeeee00eeeeeeULL, 0x008f8f8f008f8f8fULL, 0x0001010100010101ULL,
507 0x003d3d3d003d3d3dULL,
508};
509
510const u64 camellia_sp30333033[256] = {
511 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL,
512 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL,
513 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL,
514 0xc200c2c2c200c2c2ULL, 0xab00ababab00ababULL, 0x9a009a9a9a009a9aULL,
515 0x7500757575007575ULL, 0x0600060606000606ULL, 0x5700575757005757ULL,
516 0xa000a0a0a000a0a0ULL, 0x9100919191009191ULL, 0xf700f7f7f700f7f7ULL,
517 0xb500b5b5b500b5b5ULL, 0xc900c9c9c900c9c9ULL, 0xa200a2a2a200a2a2ULL,
518 0x8c008c8c8c008c8cULL, 0xd200d2d2d200d2d2ULL, 0x9000909090009090ULL,
519 0xf600f6f6f600f6f6ULL, 0x0700070707000707ULL, 0xa700a7a7a700a7a7ULL,
520 0x2700272727002727ULL, 0x8e008e8e8e008e8eULL, 0xb200b2b2b200b2b2ULL,
521 0x4900494949004949ULL, 0xde00dedede00dedeULL, 0x4300434343004343ULL,
522 0x5c005c5c5c005c5cULL, 0xd700d7d7d700d7d7ULL, 0xc700c7c7c700c7c7ULL,
523 0x3e003e3e3e003e3eULL, 0xf500f5f5f500f5f5ULL, 0x8f008f8f8f008f8fULL,
524 0x6700676767006767ULL, 0x1f001f1f1f001f1fULL, 0x1800181818001818ULL,
525 0x6e006e6e6e006e6eULL, 0xaf00afafaf00afafULL, 0x2f002f2f2f002f2fULL,
526 0xe200e2e2e200e2e2ULL, 0x8500858585008585ULL, 0x0d000d0d0d000d0dULL,
527 0x5300535353005353ULL, 0xf000f0f0f000f0f0ULL, 0x9c009c9c9c009c9cULL,
528 0x6500656565006565ULL, 0xea00eaeaea00eaeaULL, 0xa300a3a3a300a3a3ULL,
529 0xae00aeaeae00aeaeULL, 0x9e009e9e9e009e9eULL, 0xec00ececec00ececULL,
530 0x8000808080008080ULL, 0x2d002d2d2d002d2dULL, 0x6b006b6b6b006b6bULL,
531 0xa800a8a8a800a8a8ULL, 0x2b002b2b2b002b2bULL, 0x3600363636003636ULL,
532 0xa600a6a6a600a6a6ULL, 0xc500c5c5c500c5c5ULL, 0x8600868686008686ULL,
533 0x4d004d4d4d004d4dULL, 0x3300333333003333ULL, 0xfd00fdfdfd00fdfdULL,
534 0x6600666666006666ULL, 0x5800585858005858ULL, 0x9600969696009696ULL,
535 0x3a003a3a3a003a3aULL, 0x0900090909000909ULL, 0x9500959595009595ULL,
536 0x1000101010001010ULL, 0x7800787878007878ULL, 0xd800d8d8d800d8d8ULL,
537 0x4200424242004242ULL, 0xcc00cccccc00ccccULL, 0xef00efefef00efefULL,
538 0x2600262626002626ULL, 0xe500e5e5e500e5e5ULL, 0x6100616161006161ULL,
539 0x1a001a1a1a001a1aULL, 0x3f003f3f3f003f3fULL, 0x3b003b3b3b003b3bULL,
540 0x8200828282008282ULL, 0xb600b6b6b600b6b6ULL, 0xdb00dbdbdb00dbdbULL,
541 0xd400d4d4d400d4d4ULL, 0x9800989898009898ULL, 0xe800e8e8e800e8e8ULL,
542 0x8b008b8b8b008b8bULL, 0x0200020202000202ULL, 0xeb00ebebeb00ebebULL,
543 0x0a000a0a0a000a0aULL, 0x2c002c2c2c002c2cULL, 0x1d001d1d1d001d1dULL,
544 0xb000b0b0b000b0b0ULL, 0x6f006f6f6f006f6fULL, 0x8d008d8d8d008d8dULL,
545 0x8800888888008888ULL, 0x0e000e0e0e000e0eULL, 0x1900191919001919ULL,
546 0x8700878787008787ULL, 0x4e004e4e4e004e4eULL, 0x0b000b0b0b000b0bULL,
547 0xa900a9a9a900a9a9ULL, 0x0c000c0c0c000c0cULL, 0x7900797979007979ULL,
548 0x1100111111001111ULL, 0x7f007f7f7f007f7fULL, 0x2200222222002222ULL,
549 0xe700e7e7e700e7e7ULL, 0x5900595959005959ULL, 0xe100e1e1e100e1e1ULL,
550 0xda00dadada00dadaULL, 0x3d003d3d3d003d3dULL, 0xc800c8c8c800c8c8ULL,
551 0x1200121212001212ULL, 0x0400040404000404ULL, 0x7400747474007474ULL,
552 0x5400545454005454ULL, 0x3000303030003030ULL, 0x7e007e7e7e007e7eULL,
553 0xb400b4b4b400b4b4ULL, 0x2800282828002828ULL, 0x5500555555005555ULL,
554 0x6800686868006868ULL, 0x5000505050005050ULL, 0xbe00bebebe00bebeULL,
555 0xd000d0d0d000d0d0ULL, 0xc400c4c4c400c4c4ULL, 0x3100313131003131ULL,
556 0xcb00cbcbcb00cbcbULL, 0x2a002a2a2a002a2aULL, 0xad00adadad00adadULL,
557 0x0f000f0f0f000f0fULL, 0xca00cacaca00cacaULL, 0x7000707070007070ULL,
558 0xff00ffffff00ffffULL, 0x3200323232003232ULL, 0x6900696969006969ULL,
559 0x0800080808000808ULL, 0x6200626262006262ULL, 0x0000000000000000ULL,
560 0x2400242424002424ULL, 0xd100d1d1d100d1d1ULL, 0xfb00fbfbfb00fbfbULL,
561 0xba00bababa00babaULL, 0xed00ededed00ededULL, 0x4500454545004545ULL,
562 0x8100818181008181ULL, 0x7300737373007373ULL, 0x6d006d6d6d006d6dULL,
563 0x8400848484008484ULL, 0x9f009f9f9f009f9fULL, 0xee00eeeeee00eeeeULL,
564 0x4a004a4a4a004a4aULL, 0xc300c3c3c300c3c3ULL, 0x2e002e2e2e002e2eULL,
565 0xc100c1c1c100c1c1ULL, 0x0100010101000101ULL, 0xe600e6e6e600e6e6ULL,
566 0x2500252525002525ULL, 0x4800484848004848ULL, 0x9900999999009999ULL,
567 0xb900b9b9b900b9b9ULL, 0xb300b3b3b300b3b3ULL, 0x7b007b7b7b007b7bULL,
568 0xf900f9f9f900f9f9ULL, 0xce00cecece00ceceULL, 0xbf00bfbfbf00bfbfULL,
569 0xdf00dfdfdf00dfdfULL, 0x7100717171007171ULL, 0x2900292929002929ULL,
570 0xcd00cdcdcd00cdcdULL, 0x6c006c6c6c006c6cULL, 0x1300131313001313ULL,
571 0x6400646464006464ULL, 0x9b009b9b9b009b9bULL, 0x6300636363006363ULL,
572 0x9d009d9d9d009d9dULL, 0xc000c0c0c000c0c0ULL, 0x4b004b4b4b004b4bULL,
573 0xb700b7b7b700b7b7ULL, 0xa500a5a5a500a5a5ULL, 0x8900898989008989ULL,
574 0x5f005f5f5f005f5fULL, 0xb100b1b1b100b1b1ULL, 0x1700171717001717ULL,
575 0xf400f4f4f400f4f4ULL, 0xbc00bcbcbc00bcbcULL, 0xd300d3d3d300d3d3ULL,
576 0x4600464646004646ULL, 0xcf00cfcfcf00cfcfULL, 0x3700373737003737ULL,
577 0x5e005e5e5e005e5eULL, 0x4700474747004747ULL, 0x9400949494009494ULL,
578 0xfa00fafafa00fafaULL, 0xfc00fcfcfc00fcfcULL, 0x5b005b5b5b005b5bULL,
579 0x9700979797009797ULL, 0xfe00fefefe00fefeULL, 0x5a005a5a5a005a5aULL,
580 0xac00acacac00acacULL, 0x3c003c3c3c003c3cULL, 0x4c004c4c4c004c4cULL,
581 0x0300030303000303ULL, 0x3500353535003535ULL, 0xf300f3f3f300f3f3ULL,
582 0x2300232323002323ULL, 0xb800b8b8b800b8b8ULL, 0x5d005d5d5d005d5dULL,
583 0x6a006a6a6a006a6aULL, 0x9200929292009292ULL, 0xd500d5d5d500d5d5ULL,
584 0x2100212121002121ULL, 0x4400444444004444ULL, 0x5100515151005151ULL,
585 0xc600c6c6c600c6c6ULL, 0x7d007d7d7d007d7dULL, 0x3900393939003939ULL,
586 0x8300838383008383ULL, 0xdc00dcdcdc00dcdcULL, 0xaa00aaaaaa00aaaaULL,
587 0x7c007c7c7c007c7cULL, 0x7700777777007777ULL, 0x5600565656005656ULL,
588 0x0500050505000505ULL, 0x1b001b1b1b001b1bULL, 0xa400a4a4a400a4a4ULL,
589 0x1500151515001515ULL, 0x3400343434003434ULL, 0x1e001e1e1e001e1eULL,
590 0x1c001c1c1c001c1cULL, 0xf800f8f8f800f8f8ULL, 0x5200525252005252ULL,
591 0x2000202020002020ULL, 0x1400141414001414ULL, 0xe900e9e9e900e9e9ULL,
592 0xbd00bdbdbd00bdbdULL, 0xdd00dddddd00ddddULL, 0xe400e4e4e400e4e4ULL,
593 0xa100a1a1a100a1a1ULL, 0xe000e0e0e000e0e0ULL, 0x8a008a8a8a008a8aULL,
594 0xf100f1f1f100f1f1ULL, 0xd600d6d6d600d6d6ULL, 0x7a007a7a7a007a7aULL,
595 0xbb00bbbbbb00bbbbULL, 0xe300e3e3e300e3e3ULL, 0x4000404040004040ULL,
596 0x4f004f4f4f004f4fULL,
597};
598
599const u64 camellia_sp44044404[256] = {
600 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL,
601 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL,
602 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL,
603 0x6b6b006b6b6b006bULL, 0x4545004545450045ULL, 0xa5a500a5a5a500a5ULL,
604 0xeded00ededed00edULL, 0x4f4f004f4f4f004fULL, 0x1d1d001d1d1d001dULL,
605 0x9292009292920092ULL, 0x8686008686860086ULL, 0xafaf00afafaf00afULL,
606 0x7c7c007c7c7c007cULL, 0x1f1f001f1f1f001fULL, 0x3e3e003e3e3e003eULL,
607 0xdcdc00dcdcdc00dcULL, 0x5e5e005e5e5e005eULL, 0x0b0b000b0b0b000bULL,
608 0xa6a600a6a6a600a6ULL, 0x3939003939390039ULL, 0xd5d500d5d5d500d5ULL,
609 0x5d5d005d5d5d005dULL, 0xd9d900d9d9d900d9ULL, 0x5a5a005a5a5a005aULL,
610 0x5151005151510051ULL, 0x6c6c006c6c6c006cULL, 0x8b8b008b8b8b008bULL,
611 0x9a9a009a9a9a009aULL, 0xfbfb00fbfbfb00fbULL, 0xb0b000b0b0b000b0ULL,
612 0x7474007474740074ULL, 0x2b2b002b2b2b002bULL, 0xf0f000f0f0f000f0ULL,
613 0x8484008484840084ULL, 0xdfdf00dfdfdf00dfULL, 0xcbcb00cbcbcb00cbULL,
614 0x3434003434340034ULL, 0x7676007676760076ULL, 0x6d6d006d6d6d006dULL,
615 0xa9a900a9a9a900a9ULL, 0xd1d100d1d1d100d1ULL, 0x0404000404040004ULL,
616 0x1414001414140014ULL, 0x3a3a003a3a3a003aULL, 0xdede00dedede00deULL,
617 0x1111001111110011ULL, 0x3232003232320032ULL, 0x9c9c009c9c9c009cULL,
618 0x5353005353530053ULL, 0xf2f200f2f2f200f2ULL, 0xfefe00fefefe00feULL,
619 0xcfcf00cfcfcf00cfULL, 0xc3c300c3c3c300c3ULL, 0x7a7a007a7a7a007aULL,
620 0x2424002424240024ULL, 0xe8e800e8e8e800e8ULL, 0x6060006060600060ULL,
621 0x6969006969690069ULL, 0xaaaa00aaaaaa00aaULL, 0xa0a000a0a0a000a0ULL,
622 0xa1a100a1a1a100a1ULL, 0x6262006262620062ULL, 0x5454005454540054ULL,
623 0x1e1e001e1e1e001eULL, 0xe0e000e0e0e000e0ULL, 0x6464006464640064ULL,
624 0x1010001010100010ULL, 0x0000000000000000ULL, 0xa3a300a3a3a300a3ULL,
625 0x7575007575750075ULL, 0x8a8a008a8a8a008aULL, 0xe6e600e6e6e600e6ULL,
626 0x0909000909090009ULL, 0xdddd00dddddd00ddULL, 0x8787008787870087ULL,
627 0x8383008383830083ULL, 0xcdcd00cdcdcd00cdULL, 0x9090009090900090ULL,
628 0x7373007373730073ULL, 0xf6f600f6f6f600f6ULL, 0x9d9d009d9d9d009dULL,
629 0xbfbf00bfbfbf00bfULL, 0x5252005252520052ULL, 0xd8d800d8d8d800d8ULL,
630 0xc8c800c8c8c800c8ULL, 0xc6c600c6c6c600c6ULL, 0x8181008181810081ULL,
631 0x6f6f006f6f6f006fULL, 0x1313001313130013ULL, 0x6363006363630063ULL,
632 0xe9e900e9e9e900e9ULL, 0xa7a700a7a7a700a7ULL, 0x9f9f009f9f9f009fULL,
633 0xbcbc00bcbcbc00bcULL, 0x2929002929290029ULL, 0xf9f900f9f9f900f9ULL,
634 0x2f2f002f2f2f002fULL, 0xb4b400b4b4b400b4ULL, 0x7878007878780078ULL,
635 0x0606000606060006ULL, 0xe7e700e7e7e700e7ULL, 0x7171007171710071ULL,
636 0xd4d400d4d4d400d4ULL, 0xabab00ababab00abULL, 0x8888008888880088ULL,
637 0x8d8d008d8d8d008dULL, 0x7272007272720072ULL, 0xb9b900b9b9b900b9ULL,
638 0xf8f800f8f8f800f8ULL, 0xacac00acacac00acULL, 0x3636003636360036ULL,
639 0x2a2a002a2a2a002aULL, 0x3c3c003c3c3c003cULL, 0xf1f100f1f1f100f1ULL,
640 0x4040004040400040ULL, 0xd3d300d3d3d300d3ULL, 0xbbbb00bbbbbb00bbULL,
641 0x4343004343430043ULL, 0x1515001515150015ULL, 0xadad00adadad00adULL,
642 0x7777007777770077ULL, 0x8080008080800080ULL, 0x8282008282820082ULL,
643 0xecec00ececec00ecULL, 0x2727002727270027ULL, 0xe5e500e5e5e500e5ULL,
644 0x8585008585850085ULL, 0x3535003535350035ULL, 0x0c0c000c0c0c000cULL,
645 0x4141004141410041ULL, 0xefef00efefef00efULL, 0x9393009393930093ULL,
646 0x1919001919190019ULL, 0x2121002121210021ULL, 0x0e0e000e0e0e000eULL,
647 0x4e4e004e4e4e004eULL, 0x6565006565650065ULL, 0xbdbd00bdbdbd00bdULL,
648 0xb8b800b8b8b800b8ULL, 0x8f8f008f8f8f008fULL, 0xebeb00ebebeb00ebULL,
649 0xcece00cecece00ceULL, 0x3030003030300030ULL, 0x5f5f005f5f5f005fULL,
650 0xc5c500c5c5c500c5ULL, 0x1a1a001a1a1a001aULL, 0xe1e100e1e1e100e1ULL,
651 0xcaca00cacaca00caULL, 0x4747004747470047ULL, 0x3d3d003d3d3d003dULL,
652 0x0101000101010001ULL, 0xd6d600d6d6d600d6ULL, 0x5656005656560056ULL,
653 0x4d4d004d4d4d004dULL, 0x0d0d000d0d0d000dULL, 0x6666006666660066ULL,
654 0xcccc00cccccc00ccULL, 0x2d2d002d2d2d002dULL, 0x1212001212120012ULL,
655 0x2020002020200020ULL, 0xb1b100b1b1b100b1ULL, 0x9999009999990099ULL,
656 0x4c4c004c4c4c004cULL, 0xc2c200c2c2c200c2ULL, 0x7e7e007e7e7e007eULL,
657 0x0505000505050005ULL, 0xb7b700b7b7b700b7ULL, 0x3131003131310031ULL,
658 0x1717001717170017ULL, 0xd7d700d7d7d700d7ULL, 0x5858005858580058ULL,
659 0x6161006161610061ULL, 0x1b1b001b1b1b001bULL, 0x1c1c001c1c1c001cULL,
660 0x0f0f000f0f0f000fULL, 0x1616001616160016ULL, 0x1818001818180018ULL,
661 0x2222002222220022ULL, 0x4444004444440044ULL, 0xb2b200b2b2b200b2ULL,
662 0xb5b500b5b5b500b5ULL, 0x9191009191910091ULL, 0x0808000808080008ULL,
663 0xa8a800a8a8a800a8ULL, 0xfcfc00fcfcfc00fcULL, 0x5050005050500050ULL,
664 0xd0d000d0d0d000d0ULL, 0x7d7d007d7d7d007dULL, 0x8989008989890089ULL,
665 0x9797009797970097ULL, 0x5b5b005b5b5b005bULL, 0x9595009595950095ULL,
666 0xffff00ffffff00ffULL, 0xd2d200d2d2d200d2ULL, 0xc4c400c4c4c400c4ULL,
667 0x4848004848480048ULL, 0xf7f700f7f7f700f7ULL, 0xdbdb00dbdbdb00dbULL,
668 0x0303000303030003ULL, 0xdada00dadada00daULL, 0x3f3f003f3f3f003fULL,
669 0x9494009494940094ULL, 0x5c5c005c5c5c005cULL, 0x0202000202020002ULL,
670 0x4a4a004a4a4a004aULL, 0x3333003333330033ULL, 0x6767006767670067ULL,
671 0xf3f300f3f3f300f3ULL, 0x7f7f007f7f7f007fULL, 0xe2e200e2e2e200e2ULL,
672 0x9b9b009b9b9b009bULL, 0x2626002626260026ULL, 0x3737003737370037ULL,
673 0x3b3b003b3b3b003bULL, 0x9696009696960096ULL, 0x4b4b004b4b4b004bULL,
674 0xbebe00bebebe00beULL, 0x2e2e002e2e2e002eULL, 0x7979007979790079ULL,
675 0x8c8c008c8c8c008cULL, 0x6e6e006e6e6e006eULL, 0x8e8e008e8e8e008eULL,
676 0xf5f500f5f5f500f5ULL, 0xb6b600b6b6b600b6ULL, 0xfdfd00fdfdfd00fdULL,
677 0x5959005959590059ULL, 0x9898009898980098ULL, 0x6a6a006a6a6a006aULL,
678 0x4646004646460046ULL, 0xbaba00bababa00baULL, 0x2525002525250025ULL,
679 0x4242004242420042ULL, 0xa2a200a2a2a200a2ULL, 0xfafa00fafafa00faULL,
680 0x0707000707070007ULL, 0x5555005555550055ULL, 0xeeee00eeeeee00eeULL,
681 0x0a0a000a0a0a000aULL, 0x4949004949490049ULL, 0x6868006868680068ULL,
682 0x3838003838380038ULL, 0xa4a400a4a4a400a4ULL, 0x2828002828280028ULL,
683 0x7b7b007b7b7b007bULL, 0xc9c900c9c9c900c9ULL, 0xc1c100c1c1c100c1ULL,
684 0xe3e300e3e3e300e3ULL, 0xf4f400f4f4f400f4ULL, 0xc7c700c7c7c700c7ULL,
685 0x9e9e009e9e9e009eULL,
686};
687
688const u64 camellia_sp11101110[256] = {
689 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL,
690 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL,
691 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL,
692 0x8585850085858500ULL, 0x5757570057575700ULL, 0x3535350035353500ULL,
693 0xeaeaea00eaeaea00ULL, 0x0c0c0c000c0c0c00ULL, 0xaeaeae00aeaeae00ULL,
694 0x4141410041414100ULL, 0x2323230023232300ULL, 0xefefef00efefef00ULL,
695 0x6b6b6b006b6b6b00ULL, 0x9393930093939300ULL, 0x4545450045454500ULL,
696 0x1919190019191900ULL, 0xa5a5a500a5a5a500ULL, 0x2121210021212100ULL,
697 0xededed00ededed00ULL, 0x0e0e0e000e0e0e00ULL, 0x4f4f4f004f4f4f00ULL,
698 0x4e4e4e004e4e4e00ULL, 0x1d1d1d001d1d1d00ULL, 0x6565650065656500ULL,
699 0x9292920092929200ULL, 0xbdbdbd00bdbdbd00ULL, 0x8686860086868600ULL,
700 0xb8b8b800b8b8b800ULL, 0xafafaf00afafaf00ULL, 0x8f8f8f008f8f8f00ULL,
701 0x7c7c7c007c7c7c00ULL, 0xebebeb00ebebeb00ULL, 0x1f1f1f001f1f1f00ULL,
702 0xcecece00cecece00ULL, 0x3e3e3e003e3e3e00ULL, 0x3030300030303000ULL,
703 0xdcdcdc00dcdcdc00ULL, 0x5f5f5f005f5f5f00ULL, 0x5e5e5e005e5e5e00ULL,
704 0xc5c5c500c5c5c500ULL, 0x0b0b0b000b0b0b00ULL, 0x1a1a1a001a1a1a00ULL,
705 0xa6a6a600a6a6a600ULL, 0xe1e1e100e1e1e100ULL, 0x3939390039393900ULL,
706 0xcacaca00cacaca00ULL, 0xd5d5d500d5d5d500ULL, 0x4747470047474700ULL,
707 0x5d5d5d005d5d5d00ULL, 0x3d3d3d003d3d3d00ULL, 0xd9d9d900d9d9d900ULL,
708 0x0101010001010100ULL, 0x5a5a5a005a5a5a00ULL, 0xd6d6d600d6d6d600ULL,
709 0x5151510051515100ULL, 0x5656560056565600ULL, 0x6c6c6c006c6c6c00ULL,
710 0x4d4d4d004d4d4d00ULL, 0x8b8b8b008b8b8b00ULL, 0x0d0d0d000d0d0d00ULL,
711 0x9a9a9a009a9a9a00ULL, 0x6666660066666600ULL, 0xfbfbfb00fbfbfb00ULL,
712 0xcccccc00cccccc00ULL, 0xb0b0b000b0b0b000ULL, 0x2d2d2d002d2d2d00ULL,
713 0x7474740074747400ULL, 0x1212120012121200ULL, 0x2b2b2b002b2b2b00ULL,
714 0x2020200020202000ULL, 0xf0f0f000f0f0f000ULL, 0xb1b1b100b1b1b100ULL,
715 0x8484840084848400ULL, 0x9999990099999900ULL, 0xdfdfdf00dfdfdf00ULL,
716 0x4c4c4c004c4c4c00ULL, 0xcbcbcb00cbcbcb00ULL, 0xc2c2c200c2c2c200ULL,
717 0x3434340034343400ULL, 0x7e7e7e007e7e7e00ULL, 0x7676760076767600ULL,
718 0x0505050005050500ULL, 0x6d6d6d006d6d6d00ULL, 0xb7b7b700b7b7b700ULL,
719 0xa9a9a900a9a9a900ULL, 0x3131310031313100ULL, 0xd1d1d100d1d1d100ULL,
720 0x1717170017171700ULL, 0x0404040004040400ULL, 0xd7d7d700d7d7d700ULL,
721 0x1414140014141400ULL, 0x5858580058585800ULL, 0x3a3a3a003a3a3a00ULL,
722 0x6161610061616100ULL, 0xdedede00dedede00ULL, 0x1b1b1b001b1b1b00ULL,
723 0x1111110011111100ULL, 0x1c1c1c001c1c1c00ULL, 0x3232320032323200ULL,
724 0x0f0f0f000f0f0f00ULL, 0x9c9c9c009c9c9c00ULL, 0x1616160016161600ULL,
725 0x5353530053535300ULL, 0x1818180018181800ULL, 0xf2f2f200f2f2f200ULL,
726 0x2222220022222200ULL, 0xfefefe00fefefe00ULL, 0x4444440044444400ULL,
727 0xcfcfcf00cfcfcf00ULL, 0xb2b2b200b2b2b200ULL, 0xc3c3c300c3c3c300ULL,
728 0xb5b5b500b5b5b500ULL, 0x7a7a7a007a7a7a00ULL, 0x9191910091919100ULL,
729 0x2424240024242400ULL, 0x0808080008080800ULL, 0xe8e8e800e8e8e800ULL,
730 0xa8a8a800a8a8a800ULL, 0x6060600060606000ULL, 0xfcfcfc00fcfcfc00ULL,
731 0x6969690069696900ULL, 0x5050500050505000ULL, 0xaaaaaa00aaaaaa00ULL,
732 0xd0d0d000d0d0d000ULL, 0xa0a0a000a0a0a000ULL, 0x7d7d7d007d7d7d00ULL,
733 0xa1a1a100a1a1a100ULL, 0x8989890089898900ULL, 0x6262620062626200ULL,
734 0x9797970097979700ULL, 0x5454540054545400ULL, 0x5b5b5b005b5b5b00ULL,
735 0x1e1e1e001e1e1e00ULL, 0x9595950095959500ULL, 0xe0e0e000e0e0e000ULL,
736 0xffffff00ffffff00ULL, 0x6464640064646400ULL, 0xd2d2d200d2d2d200ULL,
737 0x1010100010101000ULL, 0xc4c4c400c4c4c400ULL, 0x0000000000000000ULL,
738 0x4848480048484800ULL, 0xa3a3a300a3a3a300ULL, 0xf7f7f700f7f7f700ULL,
739 0x7575750075757500ULL, 0xdbdbdb00dbdbdb00ULL, 0x8a8a8a008a8a8a00ULL,
740 0x0303030003030300ULL, 0xe6e6e600e6e6e600ULL, 0xdadada00dadada00ULL,
741 0x0909090009090900ULL, 0x3f3f3f003f3f3f00ULL, 0xdddddd00dddddd00ULL,
742 0x9494940094949400ULL, 0x8787870087878700ULL, 0x5c5c5c005c5c5c00ULL,
743 0x8383830083838300ULL, 0x0202020002020200ULL, 0xcdcdcd00cdcdcd00ULL,
744 0x4a4a4a004a4a4a00ULL, 0x9090900090909000ULL, 0x3333330033333300ULL,
745 0x7373730073737300ULL, 0x6767670067676700ULL, 0xf6f6f600f6f6f600ULL,
746 0xf3f3f300f3f3f300ULL, 0x9d9d9d009d9d9d00ULL, 0x7f7f7f007f7f7f00ULL,
747 0xbfbfbf00bfbfbf00ULL, 0xe2e2e200e2e2e200ULL, 0x5252520052525200ULL,
748 0x9b9b9b009b9b9b00ULL, 0xd8d8d800d8d8d800ULL, 0x2626260026262600ULL,
749 0xc8c8c800c8c8c800ULL, 0x3737370037373700ULL, 0xc6c6c600c6c6c600ULL,
750 0x3b3b3b003b3b3b00ULL, 0x8181810081818100ULL, 0x9696960096969600ULL,
751 0x6f6f6f006f6f6f00ULL, 0x4b4b4b004b4b4b00ULL, 0x1313130013131300ULL,
752 0xbebebe00bebebe00ULL, 0x6363630063636300ULL, 0x2e2e2e002e2e2e00ULL,
753 0xe9e9e900e9e9e900ULL, 0x7979790079797900ULL, 0xa7a7a700a7a7a700ULL,
754 0x8c8c8c008c8c8c00ULL, 0x9f9f9f009f9f9f00ULL, 0x6e6e6e006e6e6e00ULL,
755 0xbcbcbc00bcbcbc00ULL, 0x8e8e8e008e8e8e00ULL, 0x2929290029292900ULL,
756 0xf5f5f500f5f5f500ULL, 0xf9f9f900f9f9f900ULL, 0xb6b6b600b6b6b600ULL,
757 0x2f2f2f002f2f2f00ULL, 0xfdfdfd00fdfdfd00ULL, 0xb4b4b400b4b4b400ULL,
758 0x5959590059595900ULL, 0x7878780078787800ULL, 0x9898980098989800ULL,
759 0x0606060006060600ULL, 0x6a6a6a006a6a6a00ULL, 0xe7e7e700e7e7e700ULL,
760 0x4646460046464600ULL, 0x7171710071717100ULL, 0xbababa00bababa00ULL,
761 0xd4d4d400d4d4d400ULL, 0x2525250025252500ULL, 0xababab00ababab00ULL,
762 0x4242420042424200ULL, 0x8888880088888800ULL, 0xa2a2a200a2a2a200ULL,
763 0x8d8d8d008d8d8d00ULL, 0xfafafa00fafafa00ULL, 0x7272720072727200ULL,
764 0x0707070007070700ULL, 0xb9b9b900b9b9b900ULL, 0x5555550055555500ULL,
765 0xf8f8f800f8f8f800ULL, 0xeeeeee00eeeeee00ULL, 0xacacac00acacac00ULL,
766 0x0a0a0a000a0a0a00ULL, 0x3636360036363600ULL, 0x4949490049494900ULL,
767 0x2a2a2a002a2a2a00ULL, 0x6868680068686800ULL, 0x3c3c3c003c3c3c00ULL,
768 0x3838380038383800ULL, 0xf1f1f100f1f1f100ULL, 0xa4a4a400a4a4a400ULL,
769 0x4040400040404000ULL, 0x2828280028282800ULL, 0xd3d3d300d3d3d300ULL,
770 0x7b7b7b007b7b7b00ULL, 0xbbbbbb00bbbbbb00ULL, 0xc9c9c900c9c9c900ULL,
771 0x4343430043434300ULL, 0xc1c1c100c1c1c100ULL, 0x1515150015151500ULL,
772 0xe3e3e300e3e3e300ULL, 0xadadad00adadad00ULL, 0xf4f4f400f4f4f400ULL,
773 0x7777770077777700ULL, 0xc7c7c700c7c7c700ULL, 0x8080800080808000ULL,
774 0x9e9e9e009e9e9e00ULL,
775};
776
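The camellia_sp* tables above are not independent data: each one replicates the output of one byte-wide Camellia s-box into the byte lanes named in the table's suffix, so a single 64-bit lookup already carries that byte's contribution to the P permutation, and CAMELLIA_F() below reduces the whole S-then-P layer to eight lookups XORed together. A generator sketch (illustrative only; camellia_sbox1[] stands in for Camellia's byte-wide s1 table, which is not defined in this file):

#include <stdint.h>

static uint64_t sp11101110[256];

static void build_sp11101110(const uint8_t camellia_sbox1[256])
{
	int i;

	/* 0x0101010001010100 copies the byte into lanes 7,6,5,3,2,1 of the
	 * u64 (the "11101110" pattern); lanes 4 and 0 stay zero. */
	for (i = 0; i < 256; i++)
		sp11101110[i] = (uint64_t)camellia_sbox1[i] *
				0x0101010001010100ULL;
}

With Camellia's s1(0) == 0x70, this reproduces the first entry above, 0x7070700070707000ULL; the other sp* tables follow the same scheme with a different s-box and lane mask.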
777/* key constants */
778#define CAMELLIA_SIGMA1L (0xA09E667FL)
779#define CAMELLIA_SIGMA1R (0x3BCC908BL)
780#define CAMELLIA_SIGMA2L (0xB67AE858L)
781#define CAMELLIA_SIGMA2R (0x4CAA73B2L)
782#define CAMELLIA_SIGMA3L (0xC6EF372FL)
783#define CAMELLIA_SIGMA3R (0xE94F82BEL)
784#define CAMELLIA_SIGMA4L (0x54FF53A5L)
785#define CAMELLIA_SIGMA4R (0xF1D36F1CL)
786#define CAMELLIA_SIGMA5L (0x10E527FAL)
787#define CAMELLIA_SIGMA5R (0xDE682D1DL)
788#define CAMELLIA_SIGMA6L (0xB05688C2L)
789#define CAMELLIA_SIGMA6R (0xB3E6C1FDL)
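Each Σ constant is stored as two 32-bit halves; CAMELLIA_F() below glues a pair back into the 64-bit word it XORs into its input. For example:

	/* Σ1 as the 64-bit value CAMELLIA_F() actually uses */
	u64 sigma1 = ((u64)CAMELLIA_SIGMA1L << 32) | CAMELLIA_SIGMA1R;
	/* == 0xA09E667F3BCC908BULL */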
790
791/* macros */
792#define ROLDQ(l, r, bits) ({ \
793 u64 t = l; \
794 l = (l << bits) | (r >> (64 - bits)); \
795 r = (r << bits) | (t >> (64 - bits)); \
796})
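ROLDQ() rotates the 128-bit quantity held in the pair (l:r) left by `bits` (all call sites use values strictly between 0 and 64, which keeps the shifts defined), updating both halves in place. An equivalent stand-alone sketch:

#include <stdint.h>

/* Rotate the 128-bit value (hi:lo) left by n bits, 0 < n < 64. */
static void roldq128(uint64_t *hi, uint64_t *lo, unsigned int n)
{
	uint64_t t = *hi;

	*hi = (*hi << n) | (*lo >> (64 - n));
	*lo = (*lo << n) | (t  >> (64 - n));
}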
797
798#define CAMELLIA_F(x, kl, kr, y) ({ \
799 u64 ii = x ^ (((u64)kl << 32) | kr); \
800 y = camellia_sp11101110[(uint8_t)ii]; \
801 y ^= camellia_sp44044404[(uint8_t)(ii >> 8)]; \
802 ii >>= 16; \
803 y ^= camellia_sp30333033[(uint8_t)ii]; \
804 y ^= camellia_sp02220222[(uint8_t)(ii >> 8)]; \
805 ii >>= 16; \
806 y ^= camellia_sp00444404[(uint8_t)ii]; \
807 y ^= camellia_sp03303033[(uint8_t)(ii >> 8)]; \
808 ii >>= 16; \
809 y ^= camellia_sp22000222[(uint8_t)ii]; \
810 y ^= camellia_sp10011110[(uint8_t)(ii >> 8)]; \
811 y = ror64(y, 32); \
812})
813
814#define SET_SUBKEY_LR(INDEX, sRL) (subkey[(INDEX)] = ror64((sRL), 32))
815
816static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
817{
818 u64 kw4, tt;
819 u32 dw, tl, tr;
820
821 /* absorb kw2 to other subkeys */
822 /* round 2 */
823 subRL[3] ^= subRL[1];
824 /* round 4 */
825 subRL[5] ^= subRL[1];
826 /* round 6 */
827 subRL[7] ^= subRL[1];
828
829 subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
830 /* modified for FLinv(kl2) */
831 dw = (subRL[1] & subRL[9]) >> 32,
832 subRL[1] ^= rol32(dw, 1);
833
834 /* round 8 */
835 subRL[11] ^= subRL[1];
836 /* round 10 */
837 subRL[13] ^= subRL[1];
838 /* round 12 */
839 subRL[15] ^= subRL[1];
840
841 subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
842 /* modified for FLinv(kl4) */
843 dw = (subRL[1] & subRL[17]) >> 32,
844 subRL[1] ^= rol32(dw, 1);
845
846 /* round 14 */
847 subRL[19] ^= subRL[1];
848 /* round 16 */
849 subRL[21] ^= subRL[1];
850 /* round 18 */
851 subRL[23] ^= subRL[1];
852
853 if (max == 24) {
854 /* kw3 */
855 subRL[24] ^= subRL[1];
856
857 /* absorb kw4 to other subkeys */
858 kw4 = subRL[25];
859 } else {
860 subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
861 /* modified for FLinv(kl6) */
862 dw = (subRL[1] & subRL[25]) >> 32,
863 subRL[1] ^= rol32(dw, 1);
864
865 /* round 20 */
866 subRL[27] ^= subRL[1];
867 /* round 22 */
868 subRL[29] ^= subRL[1];
869 /* round 24 */
870 subRL[31] ^= subRL[1];
871 /* kw3 */
872 subRL[32] ^= subRL[1];
873
874 /* absorb kw4 to other subkeys */
875 kw4 = subRL[33];
876 /* round 23 */
877 subRL[30] ^= kw4;
878 /* round 21 */
879 subRL[28] ^= kw4;
880 /* round 19 */
881 subRL[26] ^= kw4;
882
883 kw4 ^= (kw4 & ~subRL[24]) << 32;
884 /* modified for FL(kl5) */
885 dw = (kw4 & subRL[24]) >> 32,
886 kw4 ^= rol32(dw, 1);
887 }
888
889 /* round 17 */
890 subRL[22] ^= kw4;
891 /* round 15 */
892 subRL[20] ^= kw4;
893 /* round 13 */
894 subRL[18] ^= kw4;
895
896 kw4 ^= (kw4 & ~subRL[16]) << 32;
897 /* modified for FL(kl3) */
898 dw = (kw4 & subRL[16]) >> 32,
899 kw4 ^= rol32(dw, 1);
900
901 /* round 11 */
902 subRL[14] ^= kw4;
903 /* round 9 */
904 subRL[12] ^= kw4;
905 /* round 7 */
906 subRL[10] ^= kw4;
907
908 kw4 ^= (kw4 & ~subRL[8]) << 32;
909 /* modified for FL(kl1) */
910 dw = (kw4 & subRL[8]) >> 32,
911 kw4 ^= rol32(dw, 1);
912
913 /* round 5 */
914 subRL[6] ^= kw4;
915 /* round 3 */
916 subRL[4] ^= kw4;
917 /* round 1 */
918 subRL[2] ^= kw4;
919 /* kw1 */
920 subRL[0] ^= kw4;
921
922 /* key XOR is end of F-function */
923 SET_SUBKEY_LR(0, subRL[0] ^ subRL[2]); /* kw1 */
924 SET_SUBKEY_LR(2, subRL[3]); /* round 1 */
925 SET_SUBKEY_LR(3, subRL[2] ^ subRL[4]); /* round 2 */
926 SET_SUBKEY_LR(4, subRL[3] ^ subRL[5]); /* round 3 */
927 SET_SUBKEY_LR(5, subRL[4] ^ subRL[6]); /* round 4 */
928 SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */
929
930 tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
931 dw = tl & (subRL[8] >> 32), /* FL(kl1) */
932 tr = subRL[10] ^ rol32(dw, 1);
933 tt = (tr | ((u64)tl << 32));
934
935 SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */
936 SET_SUBKEY_LR(8, subRL[8]); /* FL(kl1) */
937 SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */
938
939 tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
940 dw = tl & (subRL[9] >> 32), /* FLinv(kl2) */
941 tr = subRL[7] ^ rol32(dw, 1);
942 tt = (tr | ((u64)tl << 32));
943
944 SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */
945 SET_SUBKEY_LR(11, subRL[10] ^ subRL[12]); /* round 8 */
946 SET_SUBKEY_LR(12, subRL[11] ^ subRL[13]); /* round 9 */
947 SET_SUBKEY_LR(13, subRL[12] ^ subRL[14]); /* round 10 */
948 SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */
949
950 tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
951 dw = tl & (subRL[16] >> 32), /* FL(kl3) */
952 tr = subRL[18] ^ rol32(dw, 1);
953 tt = (tr | ((u64)tl << 32));
954
955 SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */
956 SET_SUBKEY_LR(16, subRL[16]); /* FL(kl3) */
957 SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */
958
959 tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
960 dw = tl & (subRL[17] >> 32), /* FLinv(kl4) */
961 tr = subRL[15] ^ rol32(dw, 1);
962 tt = (tr | ((u64)tl << 32));
963
964 SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */
965 SET_SUBKEY_LR(19, subRL[18] ^ subRL[20]); /* round 14 */
966 SET_SUBKEY_LR(20, subRL[19] ^ subRL[21]); /* round 15 */
967 SET_SUBKEY_LR(21, subRL[20] ^ subRL[22]); /* round 16 */
968 SET_SUBKEY_LR(22, subRL[21] ^ subRL[23]); /* round 17 */
969
970 if (max == 24) {
971 SET_SUBKEY_LR(23, subRL[22]); /* round 18 */
972 SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */
973 } else {
974 tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
975 dw = tl & (subRL[24] >> 32), /* FL(kl5) */
976 tr = subRL[26] ^ rol32(dw, 1);
977 tt = (tr | ((u64)tl << 32));
978
979 SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */
980 SET_SUBKEY_LR(24, subRL[24]); /* FL(kl5) */
981 SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */
982
983 tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
984 dw = tl & (subRL[25] >> 32), /* FLinv(kl6) */
985 tr = subRL[23] ^ rol32(dw, 1);
986 tt = (tr | ((u64)tl << 32));
987
988 SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */
989 SET_SUBKEY_LR(27, subRL[26] ^ subRL[28]); /* round 20 */
990 SET_SUBKEY_LR(28, subRL[27] ^ subRL[29]); /* round 21 */
991 SET_SUBKEY_LR(29, subRL[28] ^ subRL[30]); /* round 22 */
992 SET_SUBKEY_LR(30, subRL[29] ^ subRL[31]); /* round 23 */
993 SET_SUBKEY_LR(31, subRL[30]); /* round 24 */
994 SET_SUBKEY_LR(32, subRL[32] ^ subRL[31]); /* kw3 */
995 }
996}
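The "modified for FL(...)" / "FLinv(...)" comments above do not apply Camellia's FL layers directly; they fold the whitening words through algebraically equivalent forms. For reference only, the two layers themselves, as given in the Camellia specification (RFC 3713) and not taken from this file, look like this:

#include <linux/bitops.h>	/* rol32() */

/* x is a 64-bit half-block, ke a 64-bit subkey; the 32-bit halves are
 * combined exactly as in the specification.  Reference sketch only. */
static u64 camellia_fl(u64 x, u64 ke)
{
	u32 x1 = x >> 32, x2 = (u32)x;
	u32 k1 = ke >> 32, k2 = (u32)ke;

	x2 ^= rol32(x1 & k1, 1);
	x1 ^= (x2 | k2);
	return ((u64)x1 << 32) | x2;
}

static u64 camellia_flinv(u64 y, u64 ke)
{
	u32 y1 = y >> 32, y2 = (u32)y;
	u32 k1 = ke >> 32, k2 = (u32)ke;

	y1 ^= (y2 | k2);
	y2 ^= rol32(y1 & k1, 1);
	return ((u64)y1 << 32) | y2;
}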
997
998static void camellia_setup128(const unsigned char *key, u64 *subkey)
999{
1000 u64 kl, kr, ww;
1001 u64 subRL[26];
1002
1003 /**
1004 * k == kl || kr (|| is concatenation)
1005 */
1006 kl = get_unaligned_be64(key);
1007 kr = get_unaligned_be64(key + 8);
1008
1009 /* generate KL dependent subkeys */
1010 /* kw1 */
1011 subRL[0] = kl;
1012 /* kw2 */
1013 subRL[1] = kr;
1014
1015 /* rotation left shift 15bit */
1016 ROLDQ(kl, kr, 15);
1017
1018 /* k3 */
1019 subRL[4] = kl;
1020 /* k4 */
1021 subRL[5] = kr;
1022
1023 /* rotation left shift 15+30bit */
1024 ROLDQ(kl, kr, 30);
1025
1026 /* k7 */
1027 subRL[10] = kl;
1028 /* k8 */
1029 subRL[11] = kr;
1030
1031 /* rotation left shift 15+30+15bit */
1032 ROLDQ(kl, kr, 15);
1033
1034 /* k10 */
1035 subRL[13] = kr;
1036 /* rotation left shift 15+30+15+17 bit */
1037 ROLDQ(kl, kr, 17);
1038
1039 /* kl3 */
1040 subRL[16] = kl;
1041 /* kl4 */
1042 subRL[17] = kr;
1043
1044 /* rotation left shift 15+30+15+17+17 bit */
1045 ROLDQ(kl, kr, 17);
1046
1047 /* k13 */
1048 subRL[18] = kl;
1049 /* k14 */
1050 subRL[19] = kr;
1051
1052 /* rotation left shift 15+30+15+17+17+17 bit */
1053 ROLDQ(kl, kr, 17);
1054
1055 /* k17 */
1056 subRL[22] = kl;
1057 /* k18 */
1058 subRL[23] = kr;
1059
1060 /* generate KA */
1061 kl = subRL[0];
1062 kr = subRL[1];
1063 CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
1064 kr ^= ww;
1065 CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
1066
1067 /* current status == (kll, klr, w0, w1) */
1068 CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
1069 kr ^= ww;
1070 CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
1071 kl ^= ww;
1072
1073 /* generate KA dependent subkeys */
1074 /* k1, k2 */
1075 subRL[2] = kl;
1076 subRL[3] = kr;
1077 ROLDQ(kl, kr, 15);
1078 /* k5,k6 */
1079 subRL[6] = kl;
1080 subRL[7] = kr;
1081 ROLDQ(kl, kr, 15);
1082 /* kl1, kl2 */
1083 subRL[8] = kl;
1084 subRL[9] = kr;
1085 ROLDQ(kl, kr, 15);
1086 /* k9 */
1087 subRL[12] = kl;
1088 ROLDQ(kl, kr, 15);
1089 /* k11, k12 */
1090 subRL[14] = kl;
1091 subRL[15] = kr;
1092 ROLDQ(kl, kr, 34);
1093 /* k15, k16 */
1094 subRL[20] = kl;
1095 subRL[21] = kr;
1096 ROLDQ(kl, kr, 17);
1097 /* kw3, kw4 */
1098 subRL[24] = kl;
1099 subRL[25] = kr;
1100
1101 camellia_setup_tail(subkey, subRL, 24);
1102}
1103
1104static void camellia_setup256(const unsigned char *key, u64 *subkey)
1105{
1106 u64 kl, kr; /* left half of key */
1107 u64 krl, krr; /* right half of key */
1108 u64 ww; /* temporary variables */
1109 u64 subRL[34];
1110
1111 /**
1112 * key = (kl || kr || krl || krr) (|| is concatenation)
1113 */
1114 kl = get_unaligned_be64(key);
1115 kr = get_unaligned_be64(key + 8);
1116 krl = get_unaligned_be64(key + 16);
1117 krr = get_unaligned_be64(key + 24);
1118
1119 /* generate KL dependent subkeys */
1120 /* kw1 */
1121 subRL[0] = kl;
1122 /* kw2 */
1123 subRL[1] = kr;
1124 ROLDQ(kl, kr, 45);
1125 /* k9 */
1126 subRL[12] = kl;
1127 /* k10 */
1128 subRL[13] = kr;
1129 ROLDQ(kl, kr, 15);
1130 /* kl3 */
1131 subRL[16] = kl;
1132 /* kl4 */
1133 subRL[17] = kr;
1134 ROLDQ(kl, kr, 17);
1135 /* k17 */
1136 subRL[22] = kl;
1137 /* k18 */
1138 subRL[23] = kr;
1139 ROLDQ(kl, kr, 34);
1140 /* k23 */
1141 subRL[30] = kl;
1142 /* k24 */
1143 subRL[31] = kr;
1144
1145 /* generate KR dependent subkeys */
1146 ROLDQ(krl, krr, 15);
1147 /* k3 */
1148 subRL[4] = krl;
1149 /* k4 */
1150 subRL[5] = krr;
1151 ROLDQ(krl, krr, 15);
1152 /* kl1 */
1153 subRL[8] = krl;
1154 /* kl2 */
1155 subRL[9] = krr;
1156 ROLDQ(krl, krr, 30);
1157 /* k13 */
1158 subRL[18] = krl;
1159 /* k14 */
1160 subRL[19] = krr;
1161 ROLDQ(krl, krr, 34);
1162 /* k19 */
1163 subRL[26] = krl;
1164 /* k20 */
1165 subRL[27] = krr;
1166 ROLDQ(krl, krr, 34);
1167
1168 /* generate KA */
1169 kl = subRL[0] ^ krl;
1170 kr = subRL[1] ^ krr;
1171
1172 CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
1173 kr ^= ww;
1174 CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
1175 kl ^= krl;
1176 CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
1177 kr ^= ww ^ krr;
1178 CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
1179 kl ^= ww;
1180
1181 /* generate KB */
1182 krl ^= kl;
1183 krr ^= kr;
1184 CAMELLIA_F(krl, CAMELLIA_SIGMA5L, CAMELLIA_SIGMA5R, ww);
1185 krr ^= ww;
1186 CAMELLIA_F(krr, CAMELLIA_SIGMA6L, CAMELLIA_SIGMA6R, ww);
1187 krl ^= ww;
1188
1189 /* generate KA dependent subkeys */
1190 ROLDQ(kl, kr, 15);
1191 /* k5 */
1192 subRL[6] = kl;
1193 /* k6 */
1194 subRL[7] = kr;
1195 ROLDQ(kl, kr, 30);
1196 /* k11 */
1197 subRL[14] = kl;
1198 /* k12 */
1199 subRL[15] = kr;
1200 /* rotation left shift 32bit */
1201 ROLDQ(kl, kr, 32);
1202 /* kl5 */
1203 subRL[24] = kl;
1204 /* kl6 */
1205 subRL[25] = kr;
1206 /* rotation left shift 17 from k11,k12 -> k21,k22 */
1207 ROLDQ(kl, kr, 17);
1208 /* k21 */
1209 subRL[28] = kl;
1210 /* k22 */
1211 subRL[29] = kr;
1212
1213 /* generate KB dependent subkeys */
1214 /* k1 */
1215 subRL[2] = krl;
1216 /* k2 */
1217 subRL[3] = krr;
1218 ROLDQ(krl, krr, 30);
1219 /* k7 */
1220 subRL[10] = krl;
1221 /* k8 */
1222 subRL[11] = krr;
1223 ROLDQ(krl, krr, 30);
1224 /* k15 */
1225 subRL[20] = krl;
1226 /* k16 */
1227 subRL[21] = krr;
1228 ROLDQ(krl, krr, 51);
1229 /* kw3 */
1230 subRL[32] = krl;
1231 /* kw4 */
1232 subRL[33] = krr;
1233
1234 camellia_setup_tail(subkey, subRL, 32);
1235}
1236
1237static void camellia_setup192(const unsigned char *key, u64 *subkey)
1238{
1239 unsigned char kk[32];
1240 u64 krl, krr;
1241
1242 memcpy(kk, key, 24);
1243 memcpy((unsigned char *)&krl, key+16, 8);
1244 krr = ~krl;
1245 memcpy(kk+24, (unsigned char *)&krr, 8);
1246 camellia_setup256(kk, subkey);
1247}
1248
1249int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
1250 unsigned int key_len, u32 *flags)
1251{
1252 if (key_len != 16 && key_len != 24 && key_len != 32) {
1253 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
1254 return -EINVAL;
1255 }
1256
1257 cctx->key_length = key_len;
1258
1259 switch (key_len) {
1260 case 16:
1261 camellia_setup128(key, cctx->key_table);
1262 break;
1263 case 24:
1264 camellia_setup192(key, cctx->key_table);
1265 break;
1266 case 32:
1267 camellia_setup256(key, cctx->key_table);
1268 break;
1269 }
1270
1271 return 0;
1272}
1273EXPORT_SYMBOL_GPL(__camellia_setkey);
1274
1275static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1276 unsigned int key_len)
1277{
1278 return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
1279 &tfm->crt_flags);
1280}
1281
1282void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1283{
1284 u128 iv = *src;
1285
1286 camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
1287
1288 u128_xor(&dst[1], &dst[1], &iv);
1289}
1290EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
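The 2-way helper produces D(C0) and D(C1) ^ C0, so the caller (normally the CBC glue code) still has to XOR the first block of each pair with the preceding ciphertext block or the IV. A hypothetical driver sketch, written so it stays correct for in-place buffers:

#include <crypto/b128ops.h>	/* u128, u128_xor() */

static void cbc_dec_pair_sketch(struct camellia_ctx *ctx, u128 *dst,
				const u128 *src, u128 *prev)
{
	u128 last = src[1];	/* save before a possible in-place decrypt */

	camellia_decrypt_cbc_2way(ctx, dst, src); /* dst[0]=D(C0), dst[1]=D(C1)^C0 */
	u128_xor(&dst[0], &dst[0], prev);	  /* finish block 0 */
	*prev = last;				  /* chain to the next pair */
}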
1291
1292void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
1293{
1294 be128 ctrblk;
1295
1296 if (dst != src)
1297 *dst = *src;
1298
1299 le128_to_be128(&ctrblk, iv);
1300 le128_inc(iv);
1301
1302 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
1303}
1304EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
1305
1306void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
1307{
1308 be128 ctrblks[2];
1309
1310 if (dst != src) {
1311 dst[0] = src[0];
1312 dst[1] = src[1];
1313 }
1314
1315 le128_to_be128(&ctrblks[0], iv);
1316 le128_inc(iv);
1317 le128_to_be128(&ctrblks[1], iv);
1318 le128_inc(iv);
1319
1320 camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
1321}
1322EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
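Both CTR helpers advance the little-endian counter themselves via le128_inc(), so a caller only needs to pick the widest helper that still fits the remaining data. A hypothetical dispatch sketch, similar in spirit to what the glue_helper layer does through the camellia_ctr table below:

static void ctr_sketch(struct camellia_ctx *ctx, u128 *dst, const u128 *src,
		       le128 *iv, unsigned int nblocks)
{
	/* Prefer the 2-way helper, fall back to single blocks for the tail. */
	for (; nblocks >= 2; nblocks -= 2, src += 2, dst += 2)
		camellia_crypt_ctr_2way(ctx, dst, src, iv);
	if (nblocks)
		camellia_crypt_ctr(ctx, dst, src, iv);
}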
1323
1324static const struct common_glue_ctx camellia_enc = {
1325 .num_funcs = 2,
1326 .fpu_blocks_limit = -1,
1327
1328 .funcs = { {
1329 .num_blocks = 2,
1330 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
1331 }, {
1332 .num_blocks = 1,
1333 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
1334 } }
1335};
1336
1337static const struct common_glue_ctx camellia_ctr = {
1338 .num_funcs = 2,
1339 .fpu_blocks_limit = -1,
1340
1341 .funcs = { {
1342 .num_blocks = 2,
1343 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
1344 }, {
1345 .num_blocks = 1,
1346 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
1347 } }
1348};
1349
1350static const struct common_glue_ctx camellia_dec = {
1351 .num_funcs = 2,
1352 .fpu_blocks_limit = -1,
1353
1354 .funcs = { {
1355 .num_blocks = 2,
1356 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
1357 }, {
1358 .num_blocks = 1,
1359 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
1360 } }
1361};
1362
1363static const struct common_glue_ctx camellia_dec_cbc = {
1364 .num_funcs = 2,
1365 .fpu_blocks_limit = -1,
1366
1367 .funcs = { {
1368 .num_blocks = 2,
1369 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
1370 }, {
1371 .num_blocks = 1,
1372 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
1373 } }
1374};
1375
1376static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1377 struct scatterlist *src, unsigned int nbytes)
1378{
1379 return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
1380}
1381
1382static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1383 struct scatterlist *src, unsigned int nbytes)
1384{
1385 return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
1386}
1387
1388static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1389 struct scatterlist *src, unsigned int nbytes)
1390{
1391 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
1392 dst, src, nbytes);
1393}
1394
1395static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1396 struct scatterlist *src, unsigned int nbytes)
1397{
1398 return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
1399 nbytes);
1400}
1401
1402static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1403 struct scatterlist *src, unsigned int nbytes)
1404{
1405 return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
1406}
1407
1408static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1409{
1410 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1411 struct camellia_ctx *ctx = priv;
1412 int i;
1413
1414 while (nbytes >= 2 * bsize) {
1415 camellia_enc_blk_2way(ctx, srcdst, srcdst);
1416 srcdst += bsize * 2;
1417 nbytes -= bsize * 2;
1418 }
1419
1420 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1421 camellia_enc_blk(ctx, srcdst, srcdst);
1422}
1423
1424static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1425{
1426 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
1427 struct camellia_ctx *ctx = priv;
1428 int i;
1429
1430 while (nbytes >= 2 * bsize) {
1431 camellia_dec_blk_2way(ctx, srcdst, srcdst);
1432 srcdst += bsize * 2;
1433 nbytes -= bsize * 2;
1434 }
1435
1436 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
1437 camellia_dec_blk(ctx, srcdst, srcdst);
1438}
1439
1440int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1441 unsigned int keylen)
1442{
1443 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
1444 int err;
1445
1446 err = __camellia_setkey(&ctx->camellia_ctx, key,
1447 keylen - CAMELLIA_BLOCK_SIZE,
1448 &tfm->crt_flags);
1449 if (err)
1450 return err;
1451
1452 return lrw_init_table(&ctx->lrw_table,
1453 key + keylen - CAMELLIA_BLOCK_SIZE);
1454}
1455EXPORT_SYMBOL_GPL(lrw_camellia_setkey);
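An lrw(camellia) key is a normal Camellia key with one extra CAMELLIA_BLOCK_SIZE (16-byte) tweak key appended, so the accepted lengths are 32, 40 and 48 bytes. A minimal setkey example (hypothetical helper, all-zero key, for illustration only):

static int lrw_setkey_example(struct crypto_tfm *tfm)
{
	/* 16-byte Camellia key followed by the 16-byte LRW tweak key */
	static const u8 key[CAMELLIA_MIN_KEY_SIZE + CAMELLIA_BLOCK_SIZE];

	return lrw_camellia_setkey(tfm, key, sizeof(key));	/* 32 bytes */
}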
1456
1457static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1458 struct scatterlist *src, unsigned int nbytes)
1459{
1460 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1461 be128 buf[2 * 4];
1462 struct lrw_crypt_req req = {
1463 .tbuf = buf,
1464 .tbuflen = sizeof(buf),
1465
1466 .table_ctx = &ctx->lrw_table,
1467 .crypt_ctx = &ctx->camellia_ctx,
1468 .crypt_fn = encrypt_callback,
1469 };
1470
1471 return lrw_crypt(desc, dst, src, nbytes, &req);
1472}
1473
1474static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1475 struct scatterlist *src, unsigned int nbytes)
1476{
1477 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1478 be128 buf[2 * 4];
1479 struct lrw_crypt_req req = {
1480 .tbuf = buf,
1481 .tbuflen = sizeof(buf),
1482
1483 .table_ctx = &ctx->lrw_table,
1484 .crypt_ctx = &ctx->camellia_ctx,
1485 .crypt_fn = decrypt_callback,
1486 };
1487
1488 return lrw_crypt(desc, dst, src, nbytes, &req);
1489}
1490
1491void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)
1492{
1493 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
1494
1495 lrw_free_table(&ctx->lrw_table);
1496}
1497EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm);
1498
1499int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1500 unsigned int keylen)
1501{
1502 struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
1503 u32 *flags = &tfm->crt_flags;
1504 int err;
1505
1506 /* key consists of keys of equal size concatenated, therefore
1507 * the length must be even
1508 */
1509 if (keylen % 2) {
1510 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
1511 return -EINVAL;
1512 }
1513
1514 /* first half of xts-key is for crypt */
1515 err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
1516 if (err)
1517 return err;
1518
1519 /* second half of xts-key is for tweak */
1520 return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
1521 flags);
1522}
1523EXPORT_SYMBOL_GPL(xts_camellia_setkey);
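An xts(camellia) key is two equal-sized Camellia keys back to back, which is why only even lengths pass the check above; in practice that means 32, 48 or 64 bytes in total. A minimal setkey example (hypothetical helper, all-zero key, for illustration only):

static int xts_setkey_example(struct crypto_tfm *tfm)
{
	/* key[0..15] becomes the data key, key[16..31] the tweak key */
	static const u8 key[2 * CAMELLIA_MIN_KEY_SIZE];

	return xts_camellia_setkey(tfm, key, sizeof(key));	/* 32 bytes */
}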
1524
1525static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1526 struct scatterlist *src, unsigned int nbytes)
1527{
1528 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1529 be128 buf[2 * 4];
1530 struct xts_crypt_req req = {
1531 .tbuf = buf,
1532 .tbuflen = sizeof(buf),
1533
1534 .tweak_ctx = &ctx->tweak_ctx,
1535 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
1536 .crypt_ctx = &ctx->crypt_ctx,
1537 .crypt_fn = encrypt_callback,
1538 };
1539
1540 return xts_crypt(desc, dst, src, nbytes, &req);
1541}
1542
1543static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1544 struct scatterlist *src, unsigned int nbytes)
1545{
1546 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1547 be128 buf[2 * 4];
1548 struct xts_crypt_req req = {
1549 .tbuf = buf,
1550 .tbuflen = sizeof(buf),
1551
1552 .tweak_ctx = &ctx->tweak_ctx,
1553 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
1554 .crypt_ctx = &ctx->crypt_ctx,
1555 .crypt_fn = decrypt_callback,
1556 };
1557
1558 return xts_crypt(desc, dst, src, nbytes, &req);
1559}
1560
1561static struct crypto_alg camellia_algs[6] = { {
1562 .cra_name = "camellia",
1563 .cra_driver_name = "camellia-asm",
1564 .cra_priority = 200,
1565 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
1566 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
1567 .cra_ctxsize = sizeof(struct camellia_ctx),
1568 .cra_alignmask = 0,
1569 .cra_module = THIS_MODULE,
1570 .cra_u = {
1571 .cipher = {
1572 .cia_min_keysize = CAMELLIA_MIN_KEY_SIZE,
1573 .cia_max_keysize = CAMELLIA_MAX_KEY_SIZE,
1574 .cia_setkey = camellia_setkey,
1575 .cia_encrypt = camellia_encrypt,
1576 .cia_decrypt = camellia_decrypt
1577 }
1578 }
1579}, {
1580 .cra_name = "ecb(camellia)",
1581 .cra_driver_name = "ecb-camellia-asm",
1582 .cra_priority = 300,
1583 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1584 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
1585 .cra_ctxsize = sizeof(struct camellia_ctx),
1586 .cra_alignmask = 0,
1587 .cra_type = &crypto_blkcipher_type,
1588 .cra_module = THIS_MODULE,
1589 .cra_u = {
1590 .blkcipher = {
1591 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
1592 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
1593 .setkey = camellia_setkey,
1594 .encrypt = ecb_encrypt,
1595 .decrypt = ecb_decrypt,
1596 },
1597 },
1598}, {
1599 .cra_name = "cbc(camellia)",
1600 .cra_driver_name = "cbc-camellia-asm",
1601 .cra_priority = 300,
1602 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1603 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
1604 .cra_ctxsize = sizeof(struct camellia_ctx),
1605 .cra_alignmask = 0,
1606 .cra_type = &crypto_blkcipher_type,
1607 .cra_module = THIS_MODULE,
1608 .cra_u = {
1609 .blkcipher = {
1610 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
1611 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
1612 .ivsize = CAMELLIA_BLOCK_SIZE,
1613 .setkey = camellia_setkey,
1614 .encrypt = cbc_encrypt,
1615 .decrypt = cbc_decrypt,
1616 },
1617 },
1618}, {
1619 .cra_name = "ctr(camellia)",
1620 .cra_driver_name = "ctr-camellia-asm",
1621 .cra_priority = 300,
1622 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1623 .cra_blocksize = 1,
1624 .cra_ctxsize = sizeof(struct camellia_ctx),
1625 .cra_alignmask = 0,
1626 .cra_type = &crypto_blkcipher_type,
1627 .cra_module = THIS_MODULE,
1628 .cra_u = {
1629 .blkcipher = {
1630 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
1631 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
1632 .ivsize = CAMELLIA_BLOCK_SIZE,
1633 .setkey = camellia_setkey,
1634 .encrypt = ctr_crypt,
1635 .decrypt = ctr_crypt,
1636 },
1637 },
1638}, {
1639 .cra_name = "lrw(camellia)",
1640 .cra_driver_name = "lrw-camellia-asm",
1641 .cra_priority = 300,
1642 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1643 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
1644 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
1645 .cra_alignmask = 0,
1646 .cra_type = &crypto_blkcipher_type,
1647 .cra_module = THIS_MODULE,
1648 .cra_exit = lrw_camellia_exit_tfm,
1649 .cra_u = {
1650 .blkcipher = {
1651 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
1652 CAMELLIA_BLOCK_SIZE,
1653 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
1654 CAMELLIA_BLOCK_SIZE,
1655 .ivsize = CAMELLIA_BLOCK_SIZE,
1656 .setkey = lrw_camellia_setkey,
1657 .encrypt = lrw_encrypt,
1658 .decrypt = lrw_decrypt,
1659 },
1660 },
1661}, {
1662 .cra_name = "xts(camellia)",
1663 .cra_driver_name = "xts-camellia-asm",
1664 .cra_priority = 300,
1665 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
1666 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
1667 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
1668 .cra_alignmask = 0,
1669 .cra_type = &crypto_blkcipher_type,
1670 .cra_module = THIS_MODULE,
1671 .cra_u = {
1672 .blkcipher = {
1673 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
1674 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
1675 .ivsize = CAMELLIA_BLOCK_SIZE,
1676 .setkey = xts_camellia_setkey,
1677 .encrypt = xts_encrypt,
1678 .decrypt = xts_decrypt,
1679 },
1680 },
1681} };
1682
1683static bool is_blacklisted_cpu(void)
1684{
1685 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
1686 return false;
1687
1688 if (boot_cpu_data.x86 == 0x0f) {
1689 /*
1690	 * On Pentium 4, camellia-asm is slower than the original assembler
1691	 * implementation because of the excessive use of 64bit rotates and
1692	 * left-shifts (which are really slow on P4) that are needed to store
1693	 * and handle the 128bit block in two 64bit registers.
1694 */
1695 return true;
1696 }
1697
1698 return false;
1699}
1700
1701static int force;
1702module_param(force, int, 0);
1703MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
1704
1705static int __init init(void)
1706{
1707 if (!force && is_blacklisted_cpu()) {
1708 printk(KERN_INFO
1709 "camellia-x86_64: performance on this CPU "
1710 "would be suboptimal: disabling "
1711 "camellia-x86_64.\n");
1712 return -ENODEV;
1713 }
1714
1715 return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
1716}
1717
1718static void __exit fini(void)
1719{
1720 crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
1721}
1722
1723module_init(init);
1724module_exit(fini);
1725
1726MODULE_LICENSE("GPL");
1727MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
1728MODULE_ALIAS("camellia");
1729MODULE_ALIAS("camellia-asm");
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
deleted file mode 100644
index 15b00ac7cbd..00000000000
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ /dev/null
@@ -1,558 +0,0 @@
1/*
2 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
22 * USA
23 *
24 */
25
26.file "cast5-avx-x86_64-asm_64.S"
27
28.extern cast_s1
29.extern cast_s2
30.extern cast_s3
31.extern cast_s4
32
33/* structure of crypto context */
34#define km 0
35#define kr (16*4)
36#define rr ((16*4)+16)
37
38/* s-boxes */
39#define s1 cast_s1
40#define s2 cast_s2
41#define s3 cast_s3
42#define s4 cast_s4
43
44/**********************************************************************
45 16-way AVX cast5
46 **********************************************************************/
47#define CTX %rdi
48
49#define RL1 %xmm0
50#define RR1 %xmm1
51#define RL2 %xmm2
52#define RR2 %xmm3
53#define RL3 %xmm4
54#define RR3 %xmm5
55#define RL4 %xmm6
56#define RR4 %xmm7
57
58#define RX %xmm8
59
60#define RKM %xmm9
61#define RKR %xmm10
62#define RKRF %xmm11
63#define RKRR %xmm12
64
65#define R32 %xmm13
66#define R1ST %xmm14
67
68#define RTMP %xmm15
69
70#define RID1 %rbp
71#define RID1d %ebp
72#define RID2 %rsi
73#define RID2d %esi
74
75#define RGI1 %rdx
76#define RGI1bl %dl
77#define RGI1bh %dh
78#define RGI2 %rcx
79#define RGI2bl %cl
80#define RGI2bh %ch
81
82#define RGI3 %rax
83#define RGI3bl %al
84#define RGI3bh %ah
85#define RGI4 %rbx
86#define RGI4bl %bl
87#define RGI4bh %bh
88
89#define RFS1 %r8
90#define RFS1d %r8d
91#define RFS2 %r9
92#define RFS2d %r9d
93#define RFS3 %r10
94#define RFS3d %r10d
95
96
97#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
98 movzbl src ## bh, RID1d; \
99 movzbl src ## bl, RID2d; \
100 shrq $16, src; \
101 movl s1(, RID1, 4), dst ## d; \
102 op1 s2(, RID2, 4), dst ## d; \
103 movzbl src ## bh, RID1d; \
104 movzbl src ## bl, RID2d; \
105 interleave_op(il_reg); \
106 op2 s3(, RID1, 4), dst ## d; \
107 op3 s4(, RID2, 4), dst ## d;
108
109#define dummy(d) /* do nothing */
110
111#define shr_next(reg) \
112 shrq $16, reg;
113
114#define F_head(a, x, gi1, gi2, op0) \
115 op0 a, RKM, x; \
116 vpslld RKRF, x, RTMP; \
117 vpsrld RKRR, x, x; \
118 vpor RTMP, x, x; \
119 \
120 vmovq x, gi1; \
121 vpextrq $1, x, gi2;
122
123#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
124 lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
125 lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
126 \
127 lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
128 shlq $32, RFS2; \
129 orq RFS1, RFS2; \
130 lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
131 shlq $32, RFS1; \
132 orq RFS1, RFS3; \
133 \
134 vmovq RFS2, x; \
135 vpinsrq $1, RFS3, x, x;
136
137#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
138 F_head(b1, RX, RGI1, RGI2, op0); \
139 F_head(b2, RX, RGI3, RGI4, op0); \
140 \
141 F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
142 F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
143 \
144 vpxor a1, RX, a1; \
145 vpxor a2, RTMP, a2;
146
147#define F1_2(a1, b1, a2, b2) \
148 F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
149#define F2_2(a1, b1, a2, b2) \
150 F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
151#define F3_2(a1, b1, a2, b2) \
152 F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
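F1_2/F2_2/F3_2 are the SIMD, two-register variants of CAST5's three round-function types. As a scalar reference (per RFC 2144, not taken from this file; cast_s1..cast_s4 are assumed to be the usual 256-entry u32 S-boxes referenced above, and only the low 5 bits of kr are meaningful), the three types differ only in how the key is combined with the data and in the +/-/^ pattern over the S-box outputs:

#include <linux/bitops.h>	/* rol32() */

static u32 cast5_f1(u32 d, u32 km, u8 kr)
{
	u32 i = rol32(km + d, kr);

	return ((cast_s1[i >> 24] ^ cast_s2[(i >> 16) & 0xff]) -
		cast_s3[(i >> 8) & 0xff]) + cast_s4[i & 0xff];
}

static u32 cast5_f2(u32 d, u32 km, u8 kr)
{
	u32 i = rol32(km ^ d, kr);

	return ((cast_s1[i >> 24] - cast_s2[(i >> 16) & 0xff]) +
		cast_s3[(i >> 8) & 0xff]) ^ cast_s4[i & 0xff];
}

static u32 cast5_f3(u32 d, u32 km, u8 kr)
{
	u32 i = rol32(km - d, kr);

	return ((cast_s1[i >> 24] + cast_s2[(i >> 16) & 0xff]) ^
		cast_s3[(i >> 8) & 0xff]) - cast_s4[i & 0xff];
}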
153
154#define subround(a1, b1, a2, b2, f) \
155 F ## f ## _2(a1, b1, a2, b2);
156
157#define round(l, r, n, f) \
158 vbroadcastss (km+(4*n))(CTX), RKM; \
159 vpand R1ST, RKR, RKRF; \
160 vpsubq RKRF, R32, RKRR; \
161 vpsrldq $1, RKR, RKR; \
162 subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
163 subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
164
165#define enc_preload_rkr() \
166 vbroadcastss .L16_mask, RKR; \
167 /* add 16-bit rotation to key rotations (mod 32) */ \
168 vpxor kr(CTX), RKR, RKR;
169
170#define dec_preload_rkr() \
171 vbroadcastss .L16_mask, RKR; \
172 /* add 16-bit rotation to key rotations (mod 32) */ \
173 vpxor kr(CTX), RKR, RKR; \
174 vpshufb .Lbswap128_mask, RKR, RKR;
175
176#define transpose_2x4(x0, x1, t0, t1) \
177 vpunpckldq x1, x0, t0; \
178 vpunpckhdq x1, x0, t1; \
179 \
180 vpunpcklqdq t1, t0, x0; \
181 vpunpckhqdq t1, t0, x1;
182
183#define inpack_blocks(x0, x1, t0, t1, rmask) \
184 vpshufb rmask, x0, x0; \
185 vpshufb rmask, x1, x1; \
186 \
187 transpose_2x4(x0, x1, t0, t1)
188
189#define outunpack_blocks(x0, x1, t0, t1, rmask) \
190 transpose_2x4(x0, x1, t0, t1) \
191 \
192 vpshufb rmask, x0, x0; \
193 vpshufb rmask, x1, x1;
194
195.data
196
197.align 16
198.Lbswap_mask:
199 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
200.Lbswap128_mask:
201 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
202.Lbswap_iv_mask:
203 .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
204.L16_mask:
205 .byte 16, 16, 16, 16
206.L32_mask:
207 .byte 32, 0, 0, 0
208.Lfirst_mask:
209 .byte 0x1f, 0, 0, 0
210
211.text
212
213.align 16
214.type __cast5_enc_blk16,@function;
215
216__cast5_enc_blk16:
217 /* input:
218 * %rdi: ctx, CTX
219 * RL1: blocks 1 and 2
220 * RR1: blocks 3 and 4
221 * RL2: blocks 5 and 6
222 * RR2: blocks 7 and 8
223 * RL3: blocks 9 and 10
224 * RR3: blocks 11 and 12
225 * RL4: blocks 13 and 14
226 * RR4: blocks 15 and 16
227 * output:
228 * RL1: encrypted blocks 1 and 2
229 * RR1: encrypted blocks 3 and 4
230 * RL2: encrypted blocks 5 and 6
231 * RR2: encrypted blocks 7 and 8
232 * RL3: encrypted blocks 9 and 10
233 * RR3: encrypted blocks 11 and 12
234 * RL4: encrypted blocks 13 and 14
235 * RR4: encrypted blocks 15 and 16
236 */
237
238 pushq %rbp;
239 pushq %rbx;
240
241 vmovdqa .Lbswap_mask, RKM;
242 vmovd .Lfirst_mask, R1ST;
243 vmovd .L32_mask, R32;
244 enc_preload_rkr();
245
246 inpack_blocks(RL1, RR1, RTMP, RX, RKM);
247 inpack_blocks(RL2, RR2, RTMP, RX, RKM);
248 inpack_blocks(RL3, RR3, RTMP, RX, RKM);
249 inpack_blocks(RL4, RR4, RTMP, RX, RKM);
250
251 round(RL, RR, 0, 1);
252 round(RR, RL, 1, 2);
253 round(RL, RR, 2, 3);
254 round(RR, RL, 3, 1);
255 round(RL, RR, 4, 2);
256 round(RR, RL, 5, 3);
257 round(RL, RR, 6, 1);
258 round(RR, RL, 7, 2);
259 round(RL, RR, 8, 3);
260 round(RR, RL, 9, 1);
261 round(RL, RR, 10, 2);
262 round(RR, RL, 11, 3);
263
264 movzbl rr(CTX), %eax;
265 testl %eax, %eax;
266 jnz __skip_enc;
267
268 round(RL, RR, 12, 1);
269 round(RR, RL, 13, 2);
270 round(RL, RR, 14, 3);
271 round(RR, RL, 15, 1);
272
273__skip_enc:
274 popq %rbx;
275 popq %rbp;
276
277 vmovdqa .Lbswap_mask, RKM;
278
279 outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
280 outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
281 outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
282 outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
283
284 ret;
285
286.align 16
287.type __cast5_dec_blk16,@function;
288
289__cast5_dec_blk16:
290 /* input:
291 * %rdi: ctx, CTX
292 * RL1: encrypted blocks 1 and 2
293 * RR1: encrypted blocks 3 and 4
294 * RL2: encrypted blocks 5 and 6
295 * RR2: encrypted blocks 7 and 8
296 * RL3: encrypted blocks 9 and 10
297 * RR3: encrypted blocks 11 and 12
298 * RL4: encrypted blocks 13 and 14
299 * RR4: encrypted blocks 15 and 16
300 * output:
301 * RL1: decrypted blocks 1 and 2
302 * RR1: decrypted blocks 3 and 4
303 * RL2: decrypted blocks 5 and 6
304 * RR2: decrypted blocks 7 and 8
305 * RL3: decrypted blocks 9 and 10
306 * RR3: decrypted blocks 11 and 12
307 * RL4: decrypted blocks 13 and 14
308 * RR4: decrypted blocks 15 and 16
309 */
310
311 pushq %rbp;
312 pushq %rbx;
313
314 vmovdqa .Lbswap_mask, RKM;
315 vmovd .Lfirst_mask, R1ST;
316 vmovd .L32_mask, R32;
317 dec_preload_rkr();
318
319 inpack_blocks(RL1, RR1, RTMP, RX, RKM);
320 inpack_blocks(RL2, RR2, RTMP, RX, RKM);
321 inpack_blocks(RL3, RR3, RTMP, RX, RKM);
322 inpack_blocks(RL4, RR4, RTMP, RX, RKM);
323
324 movzbl rr(CTX), %eax;
325 testl %eax, %eax;
326 jnz __skip_dec;
327
328 round(RL, RR, 15, 1);
329 round(RR, RL, 14, 3);
330 round(RL, RR, 13, 2);
331 round(RR, RL, 12, 1);
332
333__dec_tail:
334 round(RL, RR, 11, 3);
335 round(RR, RL, 10, 2);
336 round(RL, RR, 9, 1);
337 round(RR, RL, 8, 3);
338 round(RL, RR, 7, 2);
339 round(RR, RL, 6, 1);
340 round(RL, RR, 5, 3);
341 round(RR, RL, 4, 2);
342 round(RL, RR, 3, 1);
343 round(RR, RL, 2, 3);
344 round(RL, RR, 1, 2);
345 round(RR, RL, 0, 1);
346
347 vmovdqa .Lbswap_mask, RKM;
348 popq %rbx;
349 popq %rbp;
350
351 outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
352 outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
353 outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
354 outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
355
356 ret;
357
358__skip_dec:
359 vpsrldq $4, RKR, RKR;
360 jmp __dec_tail;
361
362.align 16
363.global cast5_ecb_enc_16way
364.type cast5_ecb_enc_16way,@function;
365
366cast5_ecb_enc_16way:
367 /* input:
368 * %rdi: ctx, CTX
369 * %rsi: dst
370 * %rdx: src
371 */
372
373 movq %rsi, %r11;
374
375 vmovdqu (0*4*4)(%rdx), RL1;
376 vmovdqu (1*4*4)(%rdx), RR1;
377 vmovdqu (2*4*4)(%rdx), RL2;
378 vmovdqu (3*4*4)(%rdx), RR2;
379 vmovdqu (4*4*4)(%rdx), RL3;
380 vmovdqu (5*4*4)(%rdx), RR3;
381 vmovdqu (6*4*4)(%rdx), RL4;
382 vmovdqu (7*4*4)(%rdx), RR4;
383
384 call __cast5_enc_blk16;
385
386 vmovdqu RR1, (0*4*4)(%r11);
387 vmovdqu RL1, (1*4*4)(%r11);
388 vmovdqu RR2, (2*4*4)(%r11);
389 vmovdqu RL2, (3*4*4)(%r11);
390 vmovdqu RR3, (4*4*4)(%r11);
391 vmovdqu RL3, (5*4*4)(%r11);
392 vmovdqu RR4, (6*4*4)(%r11);
393 vmovdqu RL4, (7*4*4)(%r11);
394
395 ret;
396
397.align 16
398.global cast5_ecb_dec_16way
399.type cast5_ecb_dec_16way,@function;
400
401cast5_ecb_dec_16way:
402 /* input:
403 * %rdi: ctx, CTX
404 * %rsi: dst
405 * %rdx: src
406 */
407
408 movq %rsi, %r11;
409
410 vmovdqu (0*4*4)(%rdx), RL1;
411 vmovdqu (1*4*4)(%rdx), RR1;
412 vmovdqu (2*4*4)(%rdx), RL2;
413 vmovdqu (3*4*4)(%rdx), RR2;
414 vmovdqu (4*4*4)(%rdx), RL3;
415 vmovdqu (5*4*4)(%rdx), RR3;
416 vmovdqu (6*4*4)(%rdx), RL4;
417 vmovdqu (7*4*4)(%rdx), RR4;
418
419 call __cast5_dec_blk16;
420
421 vmovdqu RR1, (0*4*4)(%r11);
422 vmovdqu RL1, (1*4*4)(%r11);
423 vmovdqu RR2, (2*4*4)(%r11);
424 vmovdqu RL2, (3*4*4)(%r11);
425 vmovdqu RR3, (4*4*4)(%r11);
426 vmovdqu RL3, (5*4*4)(%r11);
427 vmovdqu RR4, (6*4*4)(%r11);
428 vmovdqu RL4, (7*4*4)(%r11);
429
430 ret;
431
432.align 16
433.global cast5_cbc_dec_16way
434.type cast5_cbc_dec_16way,@function;
435
436cast5_cbc_dec_16way:
437 /* input:
438 * %rdi: ctx, CTX
439 * %rsi: dst
440 * %rdx: src
441 */
442
443 pushq %r12;
444
445 movq %rsi, %r11;
446 movq %rdx, %r12;
447
448 vmovdqu (0*16)(%rdx), RL1;
449 vmovdqu (1*16)(%rdx), RR1;
450 vmovdqu (2*16)(%rdx), RL2;
451 vmovdqu (3*16)(%rdx), RR2;
452 vmovdqu (4*16)(%rdx), RL3;
453 vmovdqu (5*16)(%rdx), RR3;
454 vmovdqu (6*16)(%rdx), RL4;
455 vmovdqu (7*16)(%rdx), RR4;
456
457 call __cast5_dec_blk16;
458
459 /* xor with src */
460 vmovq (%r12), RX;
461 vpshufd $0x4f, RX, RX;
462 vpxor RX, RR1, RR1;
463 vpxor 0*16+8(%r12), RL1, RL1;
464 vpxor 1*16+8(%r12), RR2, RR2;
465 vpxor 2*16+8(%r12), RL2, RL2;
466 vpxor 3*16+8(%r12), RR3, RR3;
467 vpxor 4*16+8(%r12), RL3, RL3;
468 vpxor 5*16+8(%r12), RR4, RR4;
469 vpxor 6*16+8(%r12), RL4, RL4;
470
471 vmovdqu RR1, (0*16)(%r11);
472 vmovdqu RL1, (1*16)(%r11);
473 vmovdqu RR2, (2*16)(%r11);
474 vmovdqu RL2, (3*16)(%r11);
475 vmovdqu RR3, (4*16)(%r11);
476 vmovdqu RL3, (5*16)(%r11);
477 vmovdqu RR4, (6*16)(%r11);
478 vmovdqu RL4, (7*16)(%r11);
479
480 popq %r12;
481
482 ret;
483
484.align 16
485.global cast5_ctr_16way
486.type cast5_ctr_16way,@function;
487
488cast5_ctr_16way:
489 /* input:
490 * %rdi: ctx, CTX
491 * %rsi: dst
492 * %rdx: src
493 * %rcx: iv (big endian, 64bit)
494 */
495
496 pushq %r12;
497
498 movq %rsi, %r11;
499 movq %rdx, %r12;
500
501 vpcmpeqd RTMP, RTMP, RTMP;
502 vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
503
504 vpcmpeqd RKR, RKR, RKR;
505 vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
506 vmovdqa .Lbswap_iv_mask, R1ST;
507 vmovdqa .Lbswap128_mask, RKM;
508
509 /* load IV and byteswap */
510 vmovq (%rcx), RX;
511 vpshufb R1ST, RX, RX;
512
513 /* construct IVs */
514 vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
515 vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
516 vpsubq RKR, RX, RX;
517 vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
518 vpsubq RKR, RX, RX;
519 vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
520 vpsubq RKR, RX, RX;
521 vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
522 vpsubq RKR, RX, RX;
523 vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
524 vpsubq RKR, RX, RX;
525 vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
526 vpsubq RKR, RX, RX;
527 vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
528 vpsubq RKR, RX, RX;
529 vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
530
531 /* store last IV */
532 vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
533 vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
534 vmovq RX, (%rcx);
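	/* RTMP (-1 in the low lane) and RKR (-2 in both lanes) make the
	 * vpsubq chain above increment the 64-bit counters, so the eight
	 * registers hold big-endian counter blocks IV+0 .. IV+15 and IV+16
	 * is written back as the next IV. */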
535
536 call __cast5_enc_blk16;
537
538 /* dst = src ^ iv */
539 vpxor (0*16)(%r12), RR1, RR1;
540 vpxor (1*16)(%r12), RL1, RL1;
541 vpxor (2*16)(%r12), RR2, RR2;
542 vpxor (3*16)(%r12), RL2, RL2;
543 vpxor (4*16)(%r12), RR3, RR3;
544 vpxor (5*16)(%r12), RL3, RL3;
545 vpxor (6*16)(%r12), RR4, RR4;
546 vpxor (7*16)(%r12), RL4, RL4;
547 vmovdqu RR1, (0*16)(%r11);
548 vmovdqu RL1, (1*16)(%r11);
549 vmovdqu RR2, (2*16)(%r11);
550 vmovdqu RL2, (3*16)(%r11);
551 vmovdqu RR3, (4*16)(%r11);
552 vmovdqu RL3, (5*16)(%r11);
553 vmovdqu RR4, (6*16)(%r11);
554 vmovdqu RL4, (7*16)(%r11);
555
556 popq %r12;
557
558 ret;
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
deleted file mode 100644
index c6631813dc1..00000000000
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ /dev/null
@@ -1,497 +0,0 @@
1/*
2 * Glue Code for the AVX assembler implementation of the Cast5 Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/crypto.h>
28#include <linux/err.h>
29#include <crypto/algapi.h>
30#include <crypto/cast5.h>
31#include <crypto/cryptd.h>
32#include <crypto/ctr.h>
33#include <asm/xcr.h>
34#include <asm/xsave.h>
35#include <asm/crypto/ablk_helper.h>
36#include <asm/crypto/glue_helper.h>
37
38#define CAST5_PARALLEL_BLOCKS 16
39
40asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
41 const u8 *src);
42asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
43 const u8 *src);
44asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
45 const u8 *src);
46asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
47 __be64 *iv);
48
49static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
50{
51 return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
52 NULL, fpu_enabled, nbytes);
53}
54
55static inline void cast5_fpu_end(bool fpu_enabled)
56{
57 glue_fpu_end(fpu_enabled);
58}
59
60static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
61 bool enc)
62{
63 bool fpu_enabled = false;
64 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
65 const unsigned int bsize = CAST5_BLOCK_SIZE;
66 unsigned int nbytes;
67 void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
68 int err;
69
70 fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
71
72 err = blkcipher_walk_virt(desc, walk);
73 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
74
75 while ((nbytes = walk->nbytes)) {
76 u8 *wsrc = walk->src.virt.addr;
77 u8 *wdst = walk->dst.virt.addr;
78
79 fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
80
81 /* Process multi-block batch */
82 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
83 do {
84 fn(ctx, wdst, wsrc);
85
86 wsrc += bsize * CAST5_PARALLEL_BLOCKS;
87 wdst += bsize * CAST5_PARALLEL_BLOCKS;
88 nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
89 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
90
91 if (nbytes < bsize)
92 goto done;
93 }
94
95 fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
96
97 /* Handle leftovers */
98 do {
99 fn(ctx, wdst, wsrc);
100
101 wsrc += bsize;
102 wdst += bsize;
103 nbytes -= bsize;
104 } while (nbytes >= bsize);
105
106done:
107 err = blkcipher_walk_done(desc, walk, nbytes);
108 }
109
110 cast5_fpu_end(fpu_enabled);
111 return err;
112}
113
114static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
115 struct scatterlist *src, unsigned int nbytes)
116{
117 struct blkcipher_walk walk;
118
119 blkcipher_walk_init(&walk, dst, src, nbytes);
120 return ecb_crypt(desc, &walk, true);
121}
122
123static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
124 struct scatterlist *src, unsigned int nbytes)
125{
126 struct blkcipher_walk walk;
127
128 blkcipher_walk_init(&walk, dst, src, nbytes);
129 return ecb_crypt(desc, &walk, false);
130}
131
132static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
133 struct blkcipher_walk *walk)
134{
135 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
136 const unsigned int bsize = CAST5_BLOCK_SIZE;
137 unsigned int nbytes = walk->nbytes;
138 u64 *src = (u64 *)walk->src.virt.addr;
139 u64 *dst = (u64 *)walk->dst.virt.addr;
140 u64 *iv = (u64 *)walk->iv;
141
142 do {
143 *dst = *src ^ *iv;
144 __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
145 iv = dst;
146
147 src += 1;
148 dst += 1;
149 nbytes -= bsize;
150 } while (nbytes >= bsize);
151
152 *(u64 *)walk->iv = *iv;
153 return nbytes;
154}
155
156static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
157 struct scatterlist *src, unsigned int nbytes)
158{
159 struct blkcipher_walk walk;
160 int err;
161
162 blkcipher_walk_init(&walk, dst, src, nbytes);
163 err = blkcipher_walk_virt(desc, &walk);
164
165 while ((nbytes = walk.nbytes)) {
166 nbytes = __cbc_encrypt(desc, &walk);
167 err = blkcipher_walk_done(desc, &walk, nbytes);
168 }
169
170 return err;
171}
172
173static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
174 struct blkcipher_walk *walk)
175{
176 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
177 const unsigned int bsize = CAST5_BLOCK_SIZE;
178 unsigned int nbytes = walk->nbytes;
179 u64 *src = (u64 *)walk->src.virt.addr;
180 u64 *dst = (u64 *)walk->dst.virt.addr;
181 u64 last_iv;
182
183 /* Start of the last block. */
184 src += nbytes / bsize - 1;
185 dst += nbytes / bsize - 1;
186
187 last_iv = *src;
188
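	/* Blocks are processed back to front so that each block's
	 * predecessor ciphertext is still available to XOR in as its IV;
	 * the saved last ciphertext block becomes the new chaining IV. */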
189 /* Process multi-block batch */
190 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
191 do {
192 nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
193 src -= CAST5_PARALLEL_BLOCKS - 1;
194 dst -= CAST5_PARALLEL_BLOCKS - 1;
195
196 cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
197
198 nbytes -= bsize;
199 if (nbytes < bsize)
200 goto done;
201
202 *dst ^= *(src - 1);
203 src -= 1;
204 dst -= 1;
205 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
206
207 if (nbytes < bsize)
208 goto done;
209 }
210
211 /* Handle leftovers */
212 for (;;) {
213 __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
214
215 nbytes -= bsize;
216 if (nbytes < bsize)
217 break;
218
219 *dst ^= *(src - 1);
220 src -= 1;
221 dst -= 1;
222 }
223
224done:
225 *dst ^= *(u64 *)walk->iv;
226 *(u64 *)walk->iv = last_iv;
227
228 return nbytes;
229}
230
231static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
232 struct scatterlist *src, unsigned int nbytes)
233{
234 bool fpu_enabled = false;
235 struct blkcipher_walk walk;
236 int err;
237
238 blkcipher_walk_init(&walk, dst, src, nbytes);
239 err = blkcipher_walk_virt(desc, &walk);
240 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
241
242 while ((nbytes = walk.nbytes)) {
243 fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
244 nbytes = __cbc_decrypt(desc, &walk);
245 err = blkcipher_walk_done(desc, &walk, nbytes);
246 }
247
248 cast5_fpu_end(fpu_enabled);
249 return err;
250}
251
252static void ctr_crypt_final(struct blkcipher_desc *desc,
253 struct blkcipher_walk *walk)
254{
255 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
256 u8 *ctrblk = walk->iv;
257 u8 keystream[CAST5_BLOCK_SIZE];
258 u8 *src = walk->src.virt.addr;
259 u8 *dst = walk->dst.virt.addr;
260 unsigned int nbytes = walk->nbytes;
261
262 __cast5_encrypt(ctx, keystream, ctrblk);
263 crypto_xor(keystream, src, nbytes);
264 memcpy(dst, keystream, nbytes);
265
266 crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
267}
268
269static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
270 struct blkcipher_walk *walk)
271{
272 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
273 const unsigned int bsize = CAST5_BLOCK_SIZE;
274 unsigned int nbytes = walk->nbytes;
275 u64 *src = (u64 *)walk->src.virt.addr;
276 u64 *dst = (u64 *)walk->dst.virt.addr;
277
278 /* Process multi-block batch */
279 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
280 do {
281 cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
282 (__be64 *)walk->iv);
283
284 src += CAST5_PARALLEL_BLOCKS;
285 dst += CAST5_PARALLEL_BLOCKS;
286 nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
287 } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
288
289 if (nbytes < bsize)
290 goto done;
291 }
292
293 /* Handle leftovers */
294 do {
295 u64 ctrblk;
296
297 if (dst != src)
298 *dst = *src;
299
300 ctrblk = *(u64 *)walk->iv;
301 be64_add_cpu((__be64 *)walk->iv, 1);
302
303 __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
304 *dst ^= ctrblk;
305
306 src += 1;
307 dst += 1;
308 nbytes -= bsize;
309 } while (nbytes >= bsize);
310
311done:
312 return nbytes;
313}
314
315static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
316 struct scatterlist *src, unsigned int nbytes)
317{
318 bool fpu_enabled = false;
319 struct blkcipher_walk walk;
320 int err;
321
322 blkcipher_walk_init(&walk, dst, src, nbytes);
323 err = blkcipher_walk_virt_block(desc, &walk, CAST5_BLOCK_SIZE);
324 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
325
326 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
327 fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
328 nbytes = __ctr_crypt(desc, &walk);
329 err = blkcipher_walk_done(desc, &walk, nbytes);
330 }
331
332 cast5_fpu_end(fpu_enabled);
333
334 if (walk.nbytes) {
335 ctr_crypt_final(desc, &walk);
336 err = blkcipher_walk_done(desc, &walk, 0);
337 }
338
339 return err;
340}
341
342
343static struct crypto_alg cast5_algs[6] = { {
344 .cra_name = "__ecb-cast5-avx",
345 .cra_driver_name = "__driver-ecb-cast5-avx",
346 .cra_priority = 0,
347 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
348 .cra_blocksize = CAST5_BLOCK_SIZE,
349 .cra_ctxsize = sizeof(struct cast5_ctx),
350 .cra_alignmask = 0,
351 .cra_type = &crypto_blkcipher_type,
352 .cra_module = THIS_MODULE,
353 .cra_u = {
354 .blkcipher = {
355 .min_keysize = CAST5_MIN_KEY_SIZE,
356 .max_keysize = CAST5_MAX_KEY_SIZE,
357 .setkey = cast5_setkey,
358 .encrypt = ecb_encrypt,
359 .decrypt = ecb_decrypt,
360 },
361 },
362}, {
363 .cra_name = "__cbc-cast5-avx",
364 .cra_driver_name = "__driver-cbc-cast5-avx",
365 .cra_priority = 0,
366 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
367 .cra_blocksize = CAST5_BLOCK_SIZE,
368 .cra_ctxsize = sizeof(struct cast5_ctx),
369 .cra_alignmask = 0,
370 .cra_type = &crypto_blkcipher_type,
371 .cra_module = THIS_MODULE,
372 .cra_u = {
373 .blkcipher = {
374 .min_keysize = CAST5_MIN_KEY_SIZE,
375 .max_keysize = CAST5_MAX_KEY_SIZE,
376 .setkey = cast5_setkey,
377 .encrypt = cbc_encrypt,
378 .decrypt = cbc_decrypt,
379 },
380 },
381}, {
382 .cra_name = "__ctr-cast5-avx",
383 .cra_driver_name = "__driver-ctr-cast5-avx",
384 .cra_priority = 0,
385 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
386 .cra_blocksize = 1,
387 .cra_ctxsize = sizeof(struct cast5_ctx),
388 .cra_alignmask = 0,
389 .cra_type = &crypto_blkcipher_type,
390 .cra_module = THIS_MODULE,
391 .cra_u = {
392 .blkcipher = {
393 .min_keysize = CAST5_MIN_KEY_SIZE,
394 .max_keysize = CAST5_MAX_KEY_SIZE,
395 .ivsize = CAST5_BLOCK_SIZE,
396 .setkey = cast5_setkey,
397 .encrypt = ctr_crypt,
398 .decrypt = ctr_crypt,
399 },
400 },
401}, {
402 .cra_name = "ecb(cast5)",
403 .cra_driver_name = "ecb-cast5-avx",
404 .cra_priority = 200,
405 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
406 .cra_blocksize = CAST5_BLOCK_SIZE,
407 .cra_ctxsize = sizeof(struct async_helper_ctx),
408 .cra_alignmask = 0,
409 .cra_type = &crypto_ablkcipher_type,
410 .cra_module = THIS_MODULE,
411 .cra_init = ablk_init,
412 .cra_exit = ablk_exit,
413 .cra_u = {
414 .ablkcipher = {
415 .min_keysize = CAST5_MIN_KEY_SIZE,
416 .max_keysize = CAST5_MAX_KEY_SIZE,
417 .setkey = ablk_set_key,
418 .encrypt = ablk_encrypt,
419 .decrypt = ablk_decrypt,
420 },
421 },
422}, {
423 .cra_name = "cbc(cast5)",
424 .cra_driver_name = "cbc-cast5-avx",
425 .cra_priority = 200,
426 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
427 .cra_blocksize = CAST5_BLOCK_SIZE,
428 .cra_ctxsize = sizeof(struct async_helper_ctx),
429 .cra_alignmask = 0,
430 .cra_type = &crypto_ablkcipher_type,
431 .cra_module = THIS_MODULE,
432 .cra_init = ablk_init,
433 .cra_exit = ablk_exit,
434 .cra_u = {
435 .ablkcipher = {
436 .min_keysize = CAST5_MIN_KEY_SIZE,
437 .max_keysize = CAST5_MAX_KEY_SIZE,
438 .ivsize = CAST5_BLOCK_SIZE,
439 .setkey = ablk_set_key,
440 .encrypt = __ablk_encrypt,
441 .decrypt = ablk_decrypt,
442 },
443 },
444}, {
445 .cra_name = "ctr(cast5)",
446 .cra_driver_name = "ctr-cast5-avx",
447 .cra_priority = 200,
448 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
449 .cra_blocksize = 1,
450 .cra_ctxsize = sizeof(struct async_helper_ctx),
451 .cra_alignmask = 0,
452 .cra_type = &crypto_ablkcipher_type,
453 .cra_module = THIS_MODULE,
454 .cra_init = ablk_init,
455 .cra_exit = ablk_exit,
456 .cra_u = {
457 .ablkcipher = {
458 .min_keysize = CAST5_MIN_KEY_SIZE,
459 .max_keysize = CAST5_MAX_KEY_SIZE,
460 .ivsize = CAST5_BLOCK_SIZE,
461 .setkey = ablk_set_key,
462 .encrypt = ablk_encrypt,
463 .decrypt = ablk_encrypt,
464 .geniv = "chainiv",
465 },
466 },
467} };
468
469static int __init cast5_init(void)
470{
471 u64 xcr0;
472
473 if (!cpu_has_avx || !cpu_has_osxsave) {
474 pr_info("AVX instructions are not detected.\n");
475 return -ENODEV;
476 }
477
478 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
479 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
480 pr_info("AVX detected but unusable.\n");
481 return -ENODEV;
482 }
483
484 return crypto_register_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
485}
486
487static void __exit cast5_exit(void)
488{
489 crypto_unregister_algs(cast5_algs, ARRAY_SIZE(cast5_algs));
490}
491
492module_init(cast5_init);
493module_exit(cast5_exit);
494
495MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
496MODULE_LICENSE("GPL");
497MODULE_ALIAS("cast5");
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
deleted file mode 100644
index 2569d0da841..00000000000
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ /dev/null
@@ -1,439 +0,0 @@
1/*
2 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
22 * USA
23 *
24 */
25
26#include "glue_helper-asm-avx.S"
27
28.file "cast6-avx-x86_64-asm_64.S"
29
30.extern cast_s1
31.extern cast_s2
32.extern cast_s3
33.extern cast_s4
34
35/* structure of crypto context */
36#define km 0
37#define kr (12*4*4)
38
39/* s-boxes */
40#define s1 cast_s1
41#define s2 cast_s2
42#define s3 cast_s3
43#define s4 cast_s4
44
45/**********************************************************************
46 8-way AVX cast6
47 **********************************************************************/
48#define CTX %rdi
49
50#define RA1 %xmm0
51#define RB1 %xmm1
52#define RC1 %xmm2
53#define RD1 %xmm3
54
55#define RA2 %xmm4
56#define RB2 %xmm5
57#define RC2 %xmm6
58#define RD2 %xmm7
59
60#define RX %xmm8
61
62#define RKM %xmm9
63#define RKR %xmm10
64#define RKRF %xmm11
65#define RKRR %xmm12
66#define R32 %xmm13
67#define R1ST %xmm14
68
69#define RTMP %xmm15
70
71#define RID1 %rbp
72#define RID1d %ebp
73#define RID2 %rsi
74#define RID2d %esi
75
76#define RGI1 %rdx
77#define RGI1bl %dl
78#define RGI1bh %dh
79#define RGI2 %rcx
80#define RGI2bl %cl
81#define RGI2bh %ch
82
83#define RGI3 %rax
84#define RGI3bl %al
85#define RGI3bh %ah
86#define RGI4 %rbx
87#define RGI4bl %bl
88#define RGI4bh %bh
89
90#define RFS1 %r8
91#define RFS1d %r8d
92#define RFS2 %r9
93#define RFS2d %r9d
94#define RFS3 %r10
95#define RFS3d %r10d
96
97
98#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
99 movzbl src ## bh, RID1d; \
100 movzbl src ## bl, RID2d; \
101 shrq $16, src; \
102 movl s1(, RID1, 4), dst ## d; \
103 op1 s2(, RID2, 4), dst ## d; \
104 movzbl src ## bh, RID1d; \
105 movzbl src ## bl, RID2d; \
106 interleave_op(il_reg); \
107 op2 s3(, RID1, 4), dst ## d; \
108 op3 s4(, RID2, 4), dst ## d;
109
110#define dummy(d) /* do nothing */
111
112#define shr_next(reg) \
113 shrq $16, reg;
114
115#define F_head(a, x, gi1, gi2, op0) \
116 op0 a, RKM, x; \
117 vpslld RKRF, x, RTMP; \
118 vpsrld RKRR, x, x; \
119 vpor RTMP, x, x; \
120 \
121 vmovq x, gi1; \
122 vpextrq $1, x, gi2;
123
124#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
125 lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
126 lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
127 \
128 lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
129 shlq $32, RFS2; \
130 orq RFS1, RFS2; \
131 lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
132 shlq $32, RFS1; \
133 orq RFS1, RFS3; \
134 \
135 vmovq RFS2, x; \
136 vpinsrq $1, RFS3, x, x;
137
138#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
139 F_head(b1, RX, RGI1, RGI2, op0); \
140 F_head(b2, RX, RGI3, RGI4, op0); \
141 \
142 F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
143 F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
144 \
145 vpxor a1, RX, a1; \
146 vpxor a2, RTMP, a2;
147
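/*
 * F1/F2/F3 are the three CAST round function types: the round key is
 * added, xored or subtracted before the key-dependent rotate, and the
 * four S-box lookups are combined with the matching op1/op2/op3.
 */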
148#define F1_2(a1, b1, a2, b2) \
149 F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
150#define F2_2(a1, b1, a2, b2) \
151 F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
152#define F3_2(a1, b1, a2, b2) \
153 F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
154
155#define qop(in, out, f) \
156 F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
157
158#define get_round_keys(nn) \
159 vbroadcastss (km+(4*(nn)))(CTX), RKM; \
160 vpand R1ST, RKR, RKRF; \
161 vpsubq RKRF, R32, RKRR; \
162 vpsrldq $1, RKR, RKR;
163
164#define Q(n) \
165 get_round_keys(4*n+0); \
166 qop(RD, RC, 1); \
167 \
168 get_round_keys(4*n+1); \
169 qop(RC, RB, 2); \
170 \
171 get_round_keys(4*n+2); \
172 qop(RB, RA, 3); \
173 \
174 get_round_keys(4*n+3); \
175 qop(RA, RD, 1);
176
177#define QBAR(n) \
178 get_round_keys(4*n+3); \
179 qop(RA, RD, 1); \
180 \
181 get_round_keys(4*n+2); \
182 qop(RB, RA, 3); \
183 \
184 get_round_keys(4*n+1); \
185 qop(RC, RB, 2); \
186 \
187 get_round_keys(4*n+0); \
188 qop(RD, RC, 1);
189
190#define shuffle(mask) \
191 vpshufb mask, RKR, RKR;
192
193#define preload_rkr(n, do_mask, mask) \
194 vbroadcastss .L16_mask, RKR; \
195 /* add 16-bit rotation to key rotations (mod 32) */ \
196 vpxor (kr+n*16)(CTX), RKR, RKR; \
197 do_mask(mask);
198
199#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
200 vpunpckldq x1, x0, t0; \
201 vpunpckhdq x1, x0, t2; \
202 vpunpckldq x3, x2, t1; \
203 vpunpckhdq x3, x2, x3; \
204 \
205 vpunpcklqdq t1, t0, x0; \
206 vpunpckhqdq t1, t0, x1; \
207 vpunpcklqdq x3, t2, x2; \
208 vpunpckhqdq x3, t2, x3;
209
210#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
211 vpshufb rmask, x0, x0; \
212 vpshufb rmask, x1, x1; \
213 vpshufb rmask, x2, x2; \
214 vpshufb rmask, x3, x3; \
215 \
216 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
217
218#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
219 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
220 \
221 vpshufb rmask, x0, x0; \
222 vpshufb rmask, x1, x1; \
223 vpshufb rmask, x2, x2; \
224 vpshufb rmask, x3, x3;
225
226.data
227
228.align 16
229.Lbswap_mask:
230 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
231.Lbswap128_mask:
232 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
233.Lrkr_enc_Q_Q_QBAR_QBAR:
234 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
235.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
236 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
237.Lrkr_dec_Q_Q_Q_Q:
238 .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
239.Lrkr_dec_Q_Q_QBAR_QBAR:
240 .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
241.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
242 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
243.L16_mask:
244 .byte 16, 16, 16, 16
245.L32_mask:
246 .byte 32, 0, 0, 0
247.Lfirst_mask:
248 .byte 0x1f, 0, 0, 0
249
250.text
251
252.align 8
253.type __cast6_enc_blk8,@function;
254
255__cast6_enc_blk8:
256 /* input:
257 * %rdi: ctx, CTX
258 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
259 * output:
260 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
261 */
262
263 pushq %rbp;
264 pushq %rbx;
265
266 vmovdqa .Lbswap_mask, RKM;
267 vmovd .Lfirst_mask, R1ST;
268 vmovd .L32_mask, R32;
269
270 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
271 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
272
273 preload_rkr(0, dummy, none);
274 Q(0);
275 Q(1);
276 Q(2);
277 Q(3);
278 preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
279 Q(4);
280 Q(5);
281 QBAR(6);
282 QBAR(7);
283 preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
284 QBAR(8);
285 QBAR(9);
286 QBAR(10);
287 QBAR(11);
288
289 popq %rbx;
290 popq %rbp;
291
292 vmovdqa .Lbswap_mask, RKM;
293
294 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
295 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
296
297 ret;
298
299.align 8
300.type __cast6_dec_blk8,@function;
301
302__cast6_dec_blk8:
303 /* input:
304 * %rdi: ctx, CTX
305 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
306 * output:
307 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
308 */
309
310 pushq %rbp;
311 pushq %rbx;
312
313 vmovdqa .Lbswap_mask, RKM;
314 vmovd .Lfirst_mask, R1ST;
315 vmovd .L32_mask, R32;
316
317 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
318 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
319
320 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
321 Q(11);
322 Q(10);
323 Q(9);
324 Q(8);
325 preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
326 Q(7);
327 Q(6);
328 QBAR(5);
329 QBAR(4);
330 preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
331 QBAR(3);
332 QBAR(2);
333 QBAR(1);
334 QBAR(0);
335
336 popq %rbx;
337 popq %rbp;
338
339 vmovdqa .Lbswap_mask, RKM;
340 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
341 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
342
343 ret;
344
345.align 8
346.global cast6_ecb_enc_8way
347.type cast6_ecb_enc_8way,@function;
348
349cast6_ecb_enc_8way:
350 /* input:
351 * %rdi: ctx, CTX
352 * %rsi: dst
353 * %rdx: src
354 */
355
356 movq %rsi, %r11;
357
358 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
359
360 call __cast6_enc_blk8;
361
362 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
363
364 ret;
365
366.align 8
367.global cast6_ecb_dec_8way
368.type cast6_ecb_dec_8way,@function;
369
370cast6_ecb_dec_8way:
371 /* input:
372 * %rdi: ctx, CTX
373 * %rsi: dst
374 * %rdx: src
375 */
376
377 movq %rsi, %r11;
378
379 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
380
381 call __cast6_dec_blk8;
382
383 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
384
385 ret;
386
387.align 8
388.global cast6_cbc_dec_8way
389.type cast6_cbc_dec_8way,@function;
390
391cast6_cbc_dec_8way:
392 /* input:
393 * %rdi: ctx, CTX
394 * %rsi: dst
395 * %rdx: src
396 */
397
398 pushq %r12;
399
400 movq %rsi, %r11;
401 movq %rdx, %r12;
402
403 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
404
405 call __cast6_dec_blk8;
406
407 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
408
409 popq %r12;
410
411 ret;
412
413.align 8
414.global cast6_ctr_8way
415.type cast6_ctr_8way,@function;
416
417cast6_ctr_8way:
418 /* input:
419 * %rdi: ctx, CTX
420 * %rsi: dst
421 * %rdx: src
422 * %rcx: iv (little endian, 128bit)
423 */
424
425 pushq %r12;
426
427 movq %rsi, %r11;
428 movq %rdx, %r12;
429
430 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
431 RD2, RX, RKR, RKM);
432
433 call __cast6_enc_blk8;
434
435 store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
436
437 popq %r12;
438
439 ret;
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
deleted file mode 100644
index 92f7ca24790..00000000000
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ /dev/null
@@ -1,603 +0,0 @@
1/*
2 * Glue Code for the AVX assembler implementation of the Cast6 Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/crypto.h>
28#include <linux/err.h>
29#include <crypto/algapi.h>
30#include <crypto/cast6.h>
31#include <crypto/cryptd.h>
32#include <crypto/b128ops.h>
33#include <crypto/ctr.h>
34#include <crypto/lrw.h>
35#include <crypto/xts.h>
36#include <asm/xcr.h>
37#include <asm/xsave.h>
38#include <asm/crypto/ablk_helper.h>
39#include <asm/crypto/glue_helper.h>
40
41#define CAST6_PARALLEL_BLOCKS 8
42
43asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
44 const u8 *src);
45asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
46 const u8 *src);
47
48asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
49 const u8 *src);
50asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
51 le128 *iv);
52
53static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
54{
55 be128 ctrblk;
56
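	/* the glue layer keeps the counter little endian; convert the
	 * current value for the block encryption and advance the counter
	 * in place */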
57 le128_to_be128(&ctrblk, iv);
58 le128_inc(iv);
59
60 __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
61 u128_xor(dst, src, (u128 *)&ctrblk);
62}
63
64static const struct common_glue_ctx cast6_enc = {
65 .num_funcs = 2,
66 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
67
68 .funcs = { {
69 .num_blocks = CAST6_PARALLEL_BLOCKS,
70 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
71 }, {
72 .num_blocks = 1,
73 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
74 } }
75};
76
77static const struct common_glue_ctx cast6_ctr = {
78 .num_funcs = 2,
79 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
80
81 .funcs = { {
82 .num_blocks = CAST6_PARALLEL_BLOCKS,
83 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
84 }, {
85 .num_blocks = 1,
86 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
87 } }
88};
89
90static const struct common_glue_ctx cast6_dec = {
91 .num_funcs = 2,
92 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
93
94 .funcs = { {
95 .num_blocks = CAST6_PARALLEL_BLOCKS,
96 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
97 }, {
98 .num_blocks = 1,
99 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
100 } }
101};
102
103static const struct common_glue_ctx cast6_dec_cbc = {
104 .num_funcs = 2,
105 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
106
107 .funcs = { {
108 .num_blocks = CAST6_PARALLEL_BLOCKS,
109 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
110 }, {
111 .num_blocks = 1,
112 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
113 } }
114};
115
116static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
117 struct scatterlist *src, unsigned int nbytes)
118{
119 return glue_ecb_crypt_128bit(&cast6_enc, desc, dst, src, nbytes);
120}
121
122static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
123 struct scatterlist *src, unsigned int nbytes)
124{
125 return glue_ecb_crypt_128bit(&cast6_dec, desc, dst, src, nbytes);
126}
127
128static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
129 struct scatterlist *src, unsigned int nbytes)
130{
131 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__cast6_encrypt), desc,
132 dst, src, nbytes);
133}
134
135static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
136 struct scatterlist *src, unsigned int nbytes)
137{
138 return glue_cbc_decrypt_128bit(&cast6_dec_cbc, desc, dst, src,
139 nbytes);
140}
141
142static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
143 struct scatterlist *src, unsigned int nbytes)
144{
145 return glue_ctr_crypt_128bit(&cast6_ctr, desc, dst, src, nbytes);
146}
147
148static inline bool cast6_fpu_begin(bool fpu_enabled, unsigned int nbytes)
149{
150 return glue_fpu_begin(CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS,
151 NULL, fpu_enabled, nbytes);
152}
153
154static inline void cast6_fpu_end(bool fpu_enabled)
155{
156 glue_fpu_end(fpu_enabled);
157}
158
159struct crypt_priv {
160 struct cast6_ctx *ctx;
161 bool fpu_enabled;
162};
163
164static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
165{
166 const unsigned int bsize = CAST6_BLOCK_SIZE;
167 struct crypt_priv *ctx = priv;
168 int i;
169
170 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
171
172 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
173 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
174 return;
175 }
176
177 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
178 __cast6_encrypt(ctx->ctx, srcdst, srcdst);
179}
180
181static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
182{
183 const unsigned int bsize = CAST6_BLOCK_SIZE;
184 struct crypt_priv *ctx = priv;
185 int i;
186
187 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
188
189 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
190 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
191 return;
192 }
193
194 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
195 __cast6_decrypt(ctx->ctx, srcdst, srcdst);
196}
197
198struct cast6_lrw_ctx {
199 struct lrw_table_ctx lrw_table;
200 struct cast6_ctx cast6_ctx;
201};
202
203static int lrw_cast6_setkey(struct crypto_tfm *tfm, const u8 *key,
204 unsigned int keylen)
205{
206 struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
207 int err;
208
209 err = __cast6_setkey(&ctx->cast6_ctx, key, keylen - CAST6_BLOCK_SIZE,
210 &tfm->crt_flags);
211 if (err)
212 return err;
213
214 return lrw_init_table(&ctx->lrw_table, key + keylen - CAST6_BLOCK_SIZE);
215}
216
217static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
218 struct scatterlist *src, unsigned int nbytes)
219{
220 struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
221 be128 buf[CAST6_PARALLEL_BLOCKS];
222 struct crypt_priv crypt_ctx = {
223 .ctx = &ctx->cast6_ctx,
224 .fpu_enabled = false,
225 };
226 struct lrw_crypt_req req = {
227 .tbuf = buf,
228 .tbuflen = sizeof(buf),
229
230 .table_ctx = &ctx->lrw_table,
231 .crypt_ctx = &crypt_ctx,
232 .crypt_fn = encrypt_callback,
233 };
234 int ret;
235
236 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
237 ret = lrw_crypt(desc, dst, src, nbytes, &req);
238 cast6_fpu_end(crypt_ctx.fpu_enabled);
239
240 return ret;
241}
242
243static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
244 struct scatterlist *src, unsigned int nbytes)
245{
246 struct cast6_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
247 be128 buf[CAST6_PARALLEL_BLOCKS];
248 struct crypt_priv crypt_ctx = {
249 .ctx = &ctx->cast6_ctx,
250 .fpu_enabled = false,
251 };
252 struct lrw_crypt_req req = {
253 .tbuf = buf,
254 .tbuflen = sizeof(buf),
255
256 .table_ctx = &ctx->lrw_table,
257 .crypt_ctx = &crypt_ctx,
258 .crypt_fn = decrypt_callback,
259 };
260 int ret;
261
262 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
263 ret = lrw_crypt(desc, dst, src, nbytes, &req);
264 cast6_fpu_end(crypt_ctx.fpu_enabled);
265
266 return ret;
267}
268
269static void lrw_exit_tfm(struct crypto_tfm *tfm)
270{
271 struct cast6_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
272
273 lrw_free_table(&ctx->lrw_table);
274}
275
276struct cast6_xts_ctx {
277 struct cast6_ctx tweak_ctx;
278 struct cast6_ctx crypt_ctx;
279};
280
281static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key,
282 unsigned int keylen)
283{
284 struct cast6_xts_ctx *ctx = crypto_tfm_ctx(tfm);
285 u32 *flags = &tfm->crt_flags;
286 int err;
287
288 /* key consists of two keys of equal size concatenated, therefore
289 * the length must be even
290 */
291 if (keylen % 2) {
292 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
293 return -EINVAL;
294 }
295
296 /* first half of xts-key is for crypt */
297 err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
298 if (err)
299 return err;
300
301 /* second half of xts-key is for tweak */
302 return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
303 flags);
304}
305
306static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
307 struct scatterlist *src, unsigned int nbytes)
308{
309 struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
310 be128 buf[CAST6_PARALLEL_BLOCKS];
311 struct crypt_priv crypt_ctx = {
312 .ctx = &ctx->crypt_ctx,
313 .fpu_enabled = false,
314 };
315 struct xts_crypt_req req = {
316 .tbuf = buf,
317 .tbuflen = sizeof(buf),
318
319 .tweak_ctx = &ctx->tweak_ctx,
320 .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
321 .crypt_ctx = &crypt_ctx,
322 .crypt_fn = encrypt_callback,
323 };
324 int ret;
325
326 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
327 ret = xts_crypt(desc, dst, src, nbytes, &req);
328 cast6_fpu_end(crypt_ctx.fpu_enabled);
329
330 return ret;
331}
332
333static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
334 struct scatterlist *src, unsigned int nbytes)
335{
336 struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
337 be128 buf[CAST6_PARALLEL_BLOCKS];
338 struct crypt_priv crypt_ctx = {
339 .ctx = &ctx->crypt_ctx,
340 .fpu_enabled = false,
341 };
342 struct xts_crypt_req req = {
343 .tbuf = buf,
344 .tbuflen = sizeof(buf),
345
346 .tweak_ctx = &ctx->tweak_ctx,
347 .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
348 .crypt_ctx = &crypt_ctx,
349 .crypt_fn = decrypt_callback,
350 };
351 int ret;
352
353 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
354 ret = xts_crypt(desc, dst, src, nbytes, &req);
355 cast6_fpu_end(crypt_ctx.fpu_enabled);
356
357 return ret;
358}
359
360static struct crypto_alg cast6_algs[10] = { {
361 .cra_name = "__ecb-cast6-avx",
362 .cra_driver_name = "__driver-ecb-cast6-avx",
363 .cra_priority = 0,
364 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
365 .cra_blocksize = CAST6_BLOCK_SIZE,
366 .cra_ctxsize = sizeof(struct cast6_ctx),
367 .cra_alignmask = 0,
368 .cra_type = &crypto_blkcipher_type,
369 .cra_module = THIS_MODULE,
370 .cra_u = {
371 .blkcipher = {
372 .min_keysize = CAST6_MIN_KEY_SIZE,
373 .max_keysize = CAST6_MAX_KEY_SIZE,
374 .setkey = cast6_setkey,
375 .encrypt = ecb_encrypt,
376 .decrypt = ecb_decrypt,
377 },
378 },
379}, {
380 .cra_name = "__cbc-cast6-avx",
381 .cra_driver_name = "__driver-cbc-cast6-avx",
382 .cra_priority = 0,
383 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
384 .cra_blocksize = CAST6_BLOCK_SIZE,
385 .cra_ctxsize = sizeof(struct cast6_ctx),
386 .cra_alignmask = 0,
387 .cra_type = &crypto_blkcipher_type,
388 .cra_module = THIS_MODULE,
389 .cra_u = {
390 .blkcipher = {
391 .min_keysize = CAST6_MIN_KEY_SIZE,
392 .max_keysize = CAST6_MAX_KEY_SIZE,
393 .setkey = cast6_setkey,
394 .encrypt = cbc_encrypt,
395 .decrypt = cbc_decrypt,
396 },
397 },
398}, {
399 .cra_name = "__ctr-cast6-avx",
400 .cra_driver_name = "__driver-ctr-cast6-avx",
401 .cra_priority = 0,
402 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
403 .cra_blocksize = 1,
404 .cra_ctxsize = sizeof(struct cast6_ctx),
405 .cra_alignmask = 0,
406 .cra_type = &crypto_blkcipher_type,
407 .cra_module = THIS_MODULE,
408 .cra_u = {
409 .blkcipher = {
410 .min_keysize = CAST6_MIN_KEY_SIZE,
411 .max_keysize = CAST6_MAX_KEY_SIZE,
412 .ivsize = CAST6_BLOCK_SIZE,
413 .setkey = cast6_setkey,
414 .encrypt = ctr_crypt,
415 .decrypt = ctr_crypt,
416 },
417 },
418}, {
419 .cra_name = "__lrw-cast6-avx",
420 .cra_driver_name = "__driver-lrw-cast6-avx",
421 .cra_priority = 0,
422 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
423 .cra_blocksize = CAST6_BLOCK_SIZE,
424 .cra_ctxsize = sizeof(struct cast6_lrw_ctx),
425 .cra_alignmask = 0,
426 .cra_type = &crypto_blkcipher_type,
427 .cra_module = THIS_MODULE,
428 .cra_exit = lrw_exit_tfm,
429 .cra_u = {
430 .blkcipher = {
431 .min_keysize = CAST6_MIN_KEY_SIZE +
432 CAST6_BLOCK_SIZE,
433 .max_keysize = CAST6_MAX_KEY_SIZE +
434 CAST6_BLOCK_SIZE,
435 .ivsize = CAST6_BLOCK_SIZE,
436 .setkey = lrw_cast6_setkey,
437 .encrypt = lrw_encrypt,
438 .decrypt = lrw_decrypt,
439 },
440 },
441}, {
442 .cra_name = "__xts-cast6-avx",
443 .cra_driver_name = "__driver-xts-cast6-avx",
444 .cra_priority = 0,
445 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
446 .cra_blocksize = CAST6_BLOCK_SIZE,
447 .cra_ctxsize = sizeof(struct cast6_xts_ctx),
448 .cra_alignmask = 0,
449 .cra_type = &crypto_blkcipher_type,
450 .cra_module = THIS_MODULE,
451 .cra_u = {
452 .blkcipher = {
453 .min_keysize = CAST6_MIN_KEY_SIZE * 2,
454 .max_keysize = CAST6_MAX_KEY_SIZE * 2,
455 .ivsize = CAST6_BLOCK_SIZE,
456 .setkey = xts_cast6_setkey,
457 .encrypt = xts_encrypt,
458 .decrypt = xts_decrypt,
459 },
460 },
461}, {
462 .cra_name = "ecb(cast6)",
463 .cra_driver_name = "ecb-cast6-avx",
464 .cra_priority = 200,
465 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
466 .cra_blocksize = CAST6_BLOCK_SIZE,
467 .cra_ctxsize = sizeof(struct async_helper_ctx),
468 .cra_alignmask = 0,
469 .cra_type = &crypto_ablkcipher_type,
470 .cra_module = THIS_MODULE,
471 .cra_init = ablk_init,
472 .cra_exit = ablk_exit,
473 .cra_u = {
474 .ablkcipher = {
475 .min_keysize = CAST6_MIN_KEY_SIZE,
476 .max_keysize = CAST6_MAX_KEY_SIZE,
477 .setkey = ablk_set_key,
478 .encrypt = ablk_encrypt,
479 .decrypt = ablk_decrypt,
480 },
481 },
482}, {
483 .cra_name = "cbc(cast6)",
484 .cra_driver_name = "cbc-cast6-avx",
485 .cra_priority = 200,
486 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
487 .cra_blocksize = CAST6_BLOCK_SIZE,
488 .cra_ctxsize = sizeof(struct async_helper_ctx),
489 .cra_alignmask = 0,
490 .cra_type = &crypto_ablkcipher_type,
491 .cra_module = THIS_MODULE,
492 .cra_init = ablk_init,
493 .cra_exit = ablk_exit,
494 .cra_u = {
495 .ablkcipher = {
496 .min_keysize = CAST6_MIN_KEY_SIZE,
497 .max_keysize = CAST6_MAX_KEY_SIZE,
498 .ivsize = CAST6_BLOCK_SIZE,
499 .setkey = ablk_set_key,
500 .encrypt = __ablk_encrypt,
501 .decrypt = ablk_decrypt,
502 },
503 },
504}, {
505 .cra_name = "ctr(cast6)",
506 .cra_driver_name = "ctr-cast6-avx",
507 .cra_priority = 200,
508 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
509 .cra_blocksize = 1,
510 .cra_ctxsize = sizeof(struct async_helper_ctx),
511 .cra_alignmask = 0,
512 .cra_type = &crypto_ablkcipher_type,
513 .cra_module = THIS_MODULE,
514 .cra_init = ablk_init,
515 .cra_exit = ablk_exit,
516 .cra_u = {
517 .ablkcipher = {
518 .min_keysize = CAST6_MIN_KEY_SIZE,
519 .max_keysize = CAST6_MAX_KEY_SIZE,
520 .ivsize = CAST6_BLOCK_SIZE,
521 .setkey = ablk_set_key,
522 .encrypt = ablk_encrypt,
523 .decrypt = ablk_encrypt,
524 .geniv = "chainiv",
525 },
526 },
527}, {
528 .cra_name = "lrw(cast6)",
529 .cra_driver_name = "lrw-cast6-avx",
530 .cra_priority = 200,
531 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
532 .cra_blocksize = CAST6_BLOCK_SIZE,
533 .cra_ctxsize = sizeof(struct async_helper_ctx),
534 .cra_alignmask = 0,
535 .cra_type = &crypto_ablkcipher_type,
536 .cra_module = THIS_MODULE,
537 .cra_init = ablk_init,
538 .cra_exit = ablk_exit,
539 .cra_u = {
540 .ablkcipher = {
541 .min_keysize = CAST6_MIN_KEY_SIZE +
542 CAST6_BLOCK_SIZE,
543 .max_keysize = CAST6_MAX_KEY_SIZE +
544 CAST6_BLOCK_SIZE,
545 .ivsize = CAST6_BLOCK_SIZE,
546 .setkey = ablk_set_key,
547 .encrypt = ablk_encrypt,
548 .decrypt = ablk_decrypt,
549 },
550 },
551}, {
552 .cra_name = "xts(cast6)",
553 .cra_driver_name = "xts-cast6-avx",
554 .cra_priority = 200,
555 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
556 .cra_blocksize = CAST6_BLOCK_SIZE,
557 .cra_ctxsize = sizeof(struct async_helper_ctx),
558 .cra_alignmask = 0,
559 .cra_type = &crypto_ablkcipher_type,
560 .cra_module = THIS_MODULE,
561 .cra_init = ablk_init,
562 .cra_exit = ablk_exit,
563 .cra_u = {
564 .ablkcipher = {
565 .min_keysize = CAST6_MIN_KEY_SIZE * 2,
566 .max_keysize = CAST6_MAX_KEY_SIZE * 2,
567 .ivsize = CAST6_BLOCK_SIZE,
568 .setkey = ablk_set_key,
569 .encrypt = ablk_encrypt,
570 .decrypt = ablk_decrypt,
571 },
572 },
573} };
574
575static int __init cast6_init(void)
576{
577 u64 xcr0;
578
579 if (!cpu_has_avx || !cpu_has_osxsave) {
580 pr_info("AVX instructions are not detected.\n");
581 return -ENODEV;
582 }
583
584 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
585 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
586 pr_info("AVX detected but unusable.\n");
587 return -ENODEV;
588 }
589
590 return crypto_register_algs(cast6_algs, ARRAY_SIZE(cast6_algs));
591}
592
593static void __exit cast6_exit(void)
594{
595 crypto_unregister_algs(cast6_algs, ARRAY_SIZE(cast6_algs));
596}
597
598module_init(cast6_init);
599module_exit(cast6_exit);
600
601MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized");
602MODULE_LICENSE("GPL");
603MODULE_ALIAS("cast6");
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
deleted file mode 100644
index 6812ad98355..00000000000
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ /dev/null
@@ -1,284 +0,0 @@
1/*
2 * Using the hardware-provided CRC32 instruction to accelerate CRC32C computation.
3 * CRC32C polynomial: 0x1EDC6F41 (BE) / 0x82F63B78 (LE)
4 * CRC32 is a new instruction in Intel SSE4.2; the reference can be found at:
5 * http://www.intel.com/products/processor/manuals/
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M
8 *
9 * Copyright (C) 2008 Intel Corporation
10 * Authors: Austin Zhang <austin_zhang@linux.intel.com>
11 * Kent Liu <kent.liu@intel.com>
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms and conditions of the GNU General Public License,
15 * version 2, as published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 * more details.
21 *
22 * You should have received a copy of the GNU General Public License along with
23 * this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
25 *
26 */
27#include <linux/init.h>
28#include <linux/module.h>
29#include <linux/string.h>
30#include <linux/kernel.h>
31#include <crypto/internal/hash.h>
32
33#include <asm/cpufeature.h>
34#include <asm/cpu_device_id.h>
35#include <asm/i387.h>
36#include <asm/fpu-internal.h>
37
38#define CHKSUM_BLOCK_SIZE 1
39#define CHKSUM_DIGEST_SIZE 4
40
41#define SCALE_F sizeof(unsigned long)
42
43#ifdef CONFIG_X86_64
44#define REX_PRE "0x48, "
45#else
46#define REX_PRE
47#endif
48
49#ifdef CONFIG_X86_64
50/*
51 * use carryless multiply version of crc32c when buffer
52 * size is >= 512 (when eager fpu is enabled) or
53 * >= 1024 (when eager fpu is disabled) to account
54 * for fpu state save/restore overhead.
55 */
56#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
57#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024
58
59asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
60 unsigned int crc_init);
61static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
62#if defined(X86_FEATURE_EAGER_FPU)
63#define set_pcl_breakeven_point() \
64do { \
65 if (!use_eager_fpu()) \
66 crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
67} while (0)
68#else
69#define set_pcl_breakeven_point() \
70 (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
71#endif
72#endif /* CONFIG_X86_64 */
73
74static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
75{
76 while (length--) {
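		/* raw opcode bytes encode crc32b %cl, %esi, which keeps this
		 * building with assemblers that predate the SSE4.2 mnemonics */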
77 __asm__ __volatile__(
78 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
79 :"=S"(crc)
80 :"0"(crc), "c"(*data)
81 );
82 data++;
83 }
84
85 return crc;
86}
87
88static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
89{
90 unsigned int iquotient = len / SCALE_F;
91 unsigned int iremainder = len % SCALE_F;
92 unsigned long *ptmp = (unsigned long *)p;
93
94 while (iquotient--) {
95 __asm__ __volatile__(
96 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
97 :"=S"(crc)
98 :"0"(crc), "c"(*ptmp)
99 );
100 ptmp++;
101 }
102
103 if (iremainder)
104 crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
105 iremainder);
106
107 return crc;
108}
109
110/*
111 * Setting the seed allows arbitrary accumulators and flexible XOR policy
112 * If your algorithm starts with ~0, then XOR with ~0 before you set
113 * the seed.
114 */
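/*
 * For example, to resume from a previously returned digest D, set the
 * seed to ~D, since the final digest is the complemented accumulator.
 */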
115static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
116 unsigned int keylen)
117{
118 u32 *mctx = crypto_shash_ctx(hash);
119
120 if (keylen != sizeof(u32)) {
121 crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
122 return -EINVAL;
123 }
124 *mctx = le32_to_cpup((__le32 *)key);
125 return 0;
126}
127
128static int crc32c_intel_init(struct shash_desc *desc)
129{
130 u32 *mctx = crypto_shash_ctx(desc->tfm);
131 u32 *crcp = shash_desc_ctx(desc);
132
133 *crcp = *mctx;
134
135 return 0;
136}
137
138static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
139 unsigned int len)
140{
141 u32 *crcp = shash_desc_ctx(desc);
142
143 *crcp = crc32c_intel_le_hw(*crcp, data, len);
144 return 0;
145}
146
147static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
148 u8 *out)
149{
150 *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
151 return 0;
152}
153
154static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
155 unsigned int len, u8 *out)
156{
157 return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
158}
159
160static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
161{
162 u32 *crcp = shash_desc_ctx(desc);
163
164 *(__le32 *)out = ~cpu_to_le32p(crcp);
165 return 0;
166}
167
168static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
169 unsigned int len, u8 *out)
170{
171 return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
172 out);
173}
174
175static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
176{
177 u32 *key = crypto_tfm_ctx(tfm);
178
179 *key = ~0;
180
181 return 0;
182}
183
184#ifdef CONFIG_X86_64
185static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
186 unsigned int len)
187{
188 u32 *crcp = shash_desc_ctx(desc);
189
190 /*
191 * use faster PCL version if datasize is large enough to
192 * overcome kernel fpu state save/restore overhead
193 */
194 if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
195 kernel_fpu_begin();
196 *crcp = crc_pcl(data, len, *crcp);
197 kernel_fpu_end();
198 } else
199 *crcp = crc32c_intel_le_hw(*crcp, data, len);
200 return 0;
201}
202
203static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
204 u8 *out)
205{
206 if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
207 kernel_fpu_begin();
208 *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
209 kernel_fpu_end();
210 } else
211 *(__le32 *)out =
212 ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
213 return 0;
214}
215
216static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
217 unsigned int len, u8 *out)
218{
219 return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
220}
221
222static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
223 unsigned int len, u8 *out)
224{
225 return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
226 out);
227}
228#endif /* CONFIG_X86_64 */
229
230static struct shash_alg alg = {
231 .setkey = crc32c_intel_setkey,
232 .init = crc32c_intel_init,
233 .update = crc32c_intel_update,
234 .final = crc32c_intel_final,
235 .finup = crc32c_intel_finup,
236 .digest = crc32c_intel_digest,
237 .descsize = sizeof(u32),
238 .digestsize = CHKSUM_DIGEST_SIZE,
239 .base = {
240 .cra_name = "crc32c",
241 .cra_driver_name = "crc32c-intel",
242 .cra_priority = 200,
243 .cra_blocksize = CHKSUM_BLOCK_SIZE,
244 .cra_ctxsize = sizeof(u32),
245 .cra_module = THIS_MODULE,
246 .cra_init = crc32c_intel_cra_init,
247 }
248};
249
250static const struct x86_cpu_id crc32c_cpu_id[] = {
251 X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
252 {}
253};
254MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
255
256static int __init crc32c_intel_mod_init(void)
257{
258 if (!x86_match_cpu(crc32c_cpu_id))
259 return -ENODEV;
260#ifdef CONFIG_X86_64
261 if (cpu_has_pclmulqdq) {
262 alg.update = crc32c_pcl_intel_update;
263 alg.finup = crc32c_pcl_intel_finup;
264 alg.digest = crc32c_pcl_intel_digest;
265 set_pcl_breakeven_point();
266 }
267#endif
268 return crypto_register_shash(&alg);
269}
270
271static void __exit crc32c_intel_mod_fini(void)
272{
273 crypto_unregister_shash(&alg);
274}
275
276module_init(crc32c_intel_mod_init);
277module_exit(crc32c_intel_mod_fini);
278
279MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
280MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
281MODULE_LICENSE("GPL");
282
283MODULE_ALIAS("crc32c");
284MODULE_ALIAS("crc32c-intel");
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
deleted file mode 100644
index 93c6d39237a..00000000000
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ /dev/null
@@ -1,460 +0,0 @@
1/*
2 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
3 *
4 * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
5 * downloaded from:
6 * http://download.intel.com/design/intarch/papers/323405.pdf
7 *
8 * Copyright (C) 2012 Intel Corporation.
9 *
10 * Authors:
11 * Wajdi Feghali <wajdi.k.feghali@intel.com>
12 * James Guilford <james.guilford@intel.com>
13 * David Cote <david.m.cote@intel.com>
14 * Tim Chen <tim.c.chen@linux.intel.com>
15 *
16 * This software is available to you under a choice of one of two
17 * licenses. You may choose to be licensed under the terms of the GNU
18 * General Public License (GPL) Version 2, available from the file
19 * COPYING in the main directory of this source tree, or the
20 * OpenIB.org BSD license below:
21 *
22 * Redistribution and use in source and binary forms, with or
23 * without modification, are permitted provided that the following
24 * conditions are met:
25 *
26 * - Redistributions of source code must retain the above
27 * copyright notice, this list of conditions and the following
28 * disclaimer.
29 *
30 * - Redistributions in binary form must reproduce the above
31 * copyright notice, this list of conditions and the following
32 * disclaimer in the documentation and/or other materials
33 * provided with the distribution.
34 *
35 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 * SOFTWARE.
43 */
44
45## iSCSI CRC32 implementation with the crc32 and pclmulqdq instructions
46
47.macro LABEL prefix n
48\prefix\n\():
49.endm
50
51.macro JMPTBL_ENTRY i
52.word crc_\i - crc_array
53.endm
54
55.macro JNC_LESS_THAN j
56 jnc less_than_\j
57.endm
58
59# Define threshold where buffers are considered "small" and routed to more
60# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
61# SMALL_SIZE can be no larger than 255.
62
63#define SMALL_SIZE 200
64
65.if (SMALL_SIZE > 255)
66.error "SMALL_SIZE must be < 256"
67.endif
68
69# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
70
71.global crc_pcl
72crc_pcl:
73#define bufp %rdi
74#define bufp_dw %edi
75#define bufp_w %di
76#define bufp_b %dil
77#define bufptmp %rcx
78#define block_0 %rcx
79#define block_1 %rdx
80#define block_2 %r11
81#define len %rsi
82#define len_dw %esi
83#define len_w %si
84#define len_b %sil
85#define crc_init_arg %rdx
86#define tmp %rbx
87#define crc_init %r8
88#define crc_init_dw %r8d
89#define crc1 %r9
90#define crc2 %r10
91
92 pushq %rbx
93 pushq %rdi
94 pushq %rsi
95
96	## Move crc_init for Linux to a different register
97 mov crc_init_arg, crc_init
98
99 ################################################################
100 ## 1) ALIGN:
101 ################################################################
102
103 mov bufp, bufptmp # rdi = *buf
104 neg bufp
105 and $7, bufp # calculate the unalignment amount of
106 # the address
107 je proc_block # Skip if aligned
108
109 ## If len is less than 8 and we're unaligned, we need to jump
110 ## to special code to avoid reading beyond the end of the buffer
111 cmp $8, len
112 jae do_align
113 # less_than_8 expects length in upper 3 bits of len_dw
114 # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
115 shl $32-3+1, len_dw
116 jmp less_than_8_post_shl1
117
118do_align:
119 #### Calculate CRC of unaligned bytes of the buffer (if any)
120	movq	(bufptmp), tmp		# load a quadword from the buffer
121 add bufp, bufptmp # align buffer pointer for quadword
122 # processing
123 sub bufp, len # update buffer length
124align_loop:
125 crc32b %bl, crc_init_dw # compute crc32 of 1-byte
126 shr $8, tmp # get next byte
127 dec bufp
128 jne align_loop
129
130proc_block:
131
132 ################################################################
133 ## 2) PROCESS BLOCKS:
134 ################################################################
135
136 ## compute num of bytes to be processed
137 movq len, tmp # save num bytes in tmp
138
139 cmpq $128*24, len
140 jae full_block
141
142continue_block:
143 cmpq $SMALL_SIZE, len
144 jb small
145
146 ## len < 128*24
147 movq $2731, %rax # 2731 = ceil(2^16 / 24)
148 mul len_dw
149 shrq $16, %rax
150
151 ## eax contains floor(bytes / 24) = num 24-byte chunks to do
152
153 ## process rax 24-byte chunks (128 >= rax >= 0)
154
155 ## compute end address of each block
156 ## block 0 (base addr + RAX * 8)
157 ## block 1 (base addr + RAX * 16)
158 ## block 2 (base addr + RAX * 24)
159 lea (bufptmp, %rax, 8), block_0
160 lea (block_0, %rax, 8), block_1
161 lea (block_1, %rax, 8), block_2
162
163 xor crc1, crc1
164 xor crc2, crc2
165
166 ## branch into array
167 lea jump_table(%rip), bufp
168 movzxw (bufp, %rax, 2), len
169 offset=crc_array-jump_table
170 lea offset(bufp, len, 1), bufp
171 jmp *bufp
172
173 ################################################################
174 ## 2a) PROCESS FULL BLOCKS:
175 ################################################################
176full_block:
177 movq $128,%rax
178 lea 128*8*2(block_0), block_1
179 lea 128*8*3(block_0), block_2
180 add $128*8*1, block_0
181
182 xor crc1,crc1
183 xor crc2,crc2
184
185	# Fall through into top of crc array (crc_128)
186
187 ################################################################
188 ## 3) CRC Array:
189 ################################################################
190
191crc_array:
192 i=128
193.rept 128-1
194.altmacro
195LABEL crc_ %i
196.noaltmacro
197 crc32q -i*8(block_0), crc_init
198 crc32q -i*8(block_1), crc1
199 crc32q -i*8(block_2), crc2
200 i=(i-1)
201.endr
202
203.altmacro
204LABEL crc_ %i
205.noaltmacro
206 crc32q -i*8(block_0), crc_init
207 crc32q -i*8(block_1), crc1
208# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
209
210 mov block_2, block_0
211
212 ################################################################
213 ## 4) Combine three results:
214 ################################################################
215
216 lea (K_table-16)(%rip), bufp # first entry is for idx 1
217 shlq $3, %rax # rax *= 8
218 subq %rax, tmp # tmp -= rax*8
219 shlq $1, %rax
220 subq %rax, tmp # tmp -= rax*16
221 # (total tmp -= rax*24)
222 addq %rax, bufp
223
224 movdqa (bufp), %xmm0 # 2 consts: K1:K2
225
226 movq crc_init, %xmm1 # CRC for block 1
227 pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2
228
229 movq crc1, %xmm2 # CRC for block 2
230 pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
231
232 pxor %xmm2,%xmm1
233 movq %xmm1, %rax
234 xor -i*8(block_2), %rax
235 mov crc2, crc_init
236 crc32 %rax, crc_init
237
238################################################################
239## 5) Check for end:
240################################################################
241
242LABEL crc_ 0
243 mov tmp, len
244 cmp $128*24, tmp
245 jae full_block
246 cmp $24, tmp
247 jae continue_block
248
249less_than_24:
250 shl $32-4, len_dw # less_than_16 expects length
251 # in upper 4 bits of len_dw
252 jnc less_than_16
253 crc32q (bufptmp), crc_init
254 crc32q 8(bufptmp), crc_init
255 jz do_return
256 add $16, bufptmp
257 # len is less than 8 if we got here
258 # less_than_8 expects length in upper 3 bits of len_dw
259 # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
260 shl $2, len_dw
261 jmp less_than_8_post_shl1
262
263 #######################################################################
264 ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
265 #######################################################################
266small:
267 shl $32-8, len_dw # Prepare len_dw for less_than_256
268 j=256
269.rept 5 # j = {256, 128, 64, 32, 16}
270.altmacro
271LABEL less_than_ %j # less_than_j: Length should be in
272 # upper lg(j) bits of len_dw
273 j=(j/2)
274 shl $1, len_dw # Get next MSB
275 JNC_LESS_THAN %j
276.noaltmacro
277 i=0
278.rept (j/8)
279 crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
280 i=i+8
281.endr
282 jz do_return # Return if remaining length is zero
283 add $j, bufptmp # Advance buf
284.endr
285
286less_than_8: # Length should be stored in
287 # upper 3 bits of len_dw
288 shl $1, len_dw
289less_than_8_post_shl1:
290 jnc less_than_4
291 crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
292 jz do_return # return if remaining data is zero
293 add $4, bufptmp
294less_than_4: # Length should be stored in
295 # upper 2 bits of len_dw
296 shl $1, len_dw
297 jnc less_than_2
298 crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
299 jz do_return # return if remaining data is zero
300 add $2, bufptmp
301less_than_2: # Length should be stored in the MSB
302 # of len_dw
303 shl $1, len_dw
304 jnc less_than_1
305 crc32b (bufptmp), crc_init_dw # CRC of 1 byte
306less_than_1: # Length should be zero
307do_return:
308 movq crc_init, %rax
309 popq %rsi
310 popq %rdi
311 popq %rbx
312 ret
313
314 ################################################################
315	## jump table: 129 entries x 2 bytes each
316 ################################################################
317.align 4
318jump_table:
319 i=0
320.rept 129
321.altmacro
322JMPTBL_ENTRY %i
323.noaltmacro
324 i=i+1
325.endr
326 ################################################################
327 ## PCLMULQDQ tables
328 ## Table is 128 entries x 2 quad words each
329 ################################################################
330.data
331.align 64
332K_table:
333 .quad 0x14cd00bd6,0x105ec76f0
334 .quad 0x0ba4fc28e,0x14cd00bd6
335 .quad 0x1d82c63da,0x0f20c0dfe
336 .quad 0x09e4addf8,0x0ba4fc28e
337 .quad 0x039d3b296,0x1384aa63a
338 .quad 0x102f9b8a2,0x1d82c63da
339 .quad 0x14237f5e6,0x01c291d04
340 .quad 0x00d3b6092,0x09e4addf8
341 .quad 0x0c96cfdc0,0x0740eef02
342 .quad 0x18266e456,0x039d3b296
343 .quad 0x0daece73e,0x0083a6eec
344 .quad 0x0ab7aff2a,0x102f9b8a2
345 .quad 0x1248ea574,0x1c1733996
346 .quad 0x083348832,0x14237f5e6
347 .quad 0x12c743124,0x02ad91c30
348 .quad 0x0b9e02b86,0x00d3b6092
349 .quad 0x018b33a4e,0x06992cea2
350 .quad 0x1b331e26a,0x0c96cfdc0
351 .quad 0x17d35ba46,0x07e908048
352 .quad 0x1bf2e8b8a,0x18266e456
353 .quad 0x1a3e0968a,0x11ed1f9d8
354 .quad 0x0ce7f39f4,0x0daece73e
355 .quad 0x061d82e56,0x0f1d0f55e
356 .quad 0x0d270f1a2,0x0ab7aff2a
357 .quad 0x1c3f5f66c,0x0a87ab8a8
358 .quad 0x12ed0daac,0x1248ea574
359 .quad 0x065863b64,0x08462d800
360 .quad 0x11eef4f8e,0x083348832
361 .quad 0x1ee54f54c,0x071d111a8
362 .quad 0x0b3e32c28,0x12c743124
363 .quad 0x0064f7f26,0x0ffd852c6
364 .quad 0x0dd7e3b0c,0x0b9e02b86
365 .quad 0x0f285651c,0x0dcb17aa4
366 .quad 0x010746f3c,0x018b33a4e
367 .quad 0x1c24afea4,0x0f37c5aee
368 .quad 0x0271d9844,0x1b331e26a
369 .quad 0x08e766a0c,0x06051d5a2
370 .quad 0x093a5f730,0x17d35ba46
371 .quad 0x06cb08e5c,0x11d5ca20e
372 .quad 0x06b749fb2,0x1bf2e8b8a
373 .quad 0x1167f94f2,0x021f3d99c
374 .quad 0x0cec3662e,0x1a3e0968a
375 .quad 0x19329634a,0x08f158014
376 .quad 0x0e6fc4e6a,0x0ce7f39f4
377 .quad 0x08227bb8a,0x1a5e82106
378 .quad 0x0b0cd4768,0x061d82e56
379 .quad 0x13c2b89c4,0x188815ab2
380 .quad 0x0d7a4825c,0x0d270f1a2
381 .quad 0x10f5ff2ba,0x105405f3e
382 .quad 0x00167d312,0x1c3f5f66c
383 .quad 0x0f6076544,0x0e9adf796
384 .quad 0x026f6a60a,0x12ed0daac
385 .quad 0x1a2adb74e,0x096638b34
386 .quad 0x19d34af3a,0x065863b64
387 .quad 0x049c3cc9c,0x1e50585a0
388 .quad 0x068bce87a,0x11eef4f8e
389 .quad 0x1524fa6c6,0x19f1c69dc
390 .quad 0x16cba8aca,0x1ee54f54c
391 .quad 0x042d98888,0x12913343e
392 .quad 0x1329d9f7e,0x0b3e32c28
393 .quad 0x1b1c69528,0x088f25a3a
394 .quad 0x02178513a,0x0064f7f26
395 .quad 0x0e0ac139e,0x04e36f0b0
396 .quad 0x0170076fa,0x0dd7e3b0c
397 .quad 0x141a1a2e2,0x0bd6f81f8
398 .quad 0x16ad828b4,0x0f285651c
399 .quad 0x041d17b64,0x19425cbba
400 .quad 0x1fae1cc66,0x010746f3c
401 .quad 0x1a75b4b00,0x18db37e8a
402 .quad 0x0f872e54c,0x1c24afea4
403 .quad 0x01e41e9fc,0x04c144932
404 .quad 0x086d8e4d2,0x0271d9844
405 .quad 0x160f7af7a,0x052148f02
406 .quad 0x05bb8f1bc,0x08e766a0c
407 .quad 0x0a90fd27a,0x0a3c6f37a
408 .quad 0x0b3af077a,0x093a5f730
409 .quad 0x04984d782,0x1d22c238e
410 .quad 0x0ca6ef3ac,0x06cb08e5c
411 .quad 0x0234e0b26,0x063ded06a
412 .quad 0x1d88abd4a,0x06b749fb2
413 .quad 0x04597456a,0x04d56973c
414 .quad 0x0e9e28eb4,0x1167f94f2
415 .quad 0x07b3ff57a,0x19385bf2e
416 .quad 0x0c9c8b782,0x0cec3662e
417 .quad 0x13a9cba9e,0x0e417f38a
418 .quad 0x093e106a4,0x19329634a
419 .quad 0x167001a9c,0x14e727980
420 .quad 0x1ddffc5d4,0x0e6fc4e6a
421 .quad 0x00df04680,0x0d104b8fc
422 .quad 0x02342001e,0x08227bb8a
423 .quad 0x00a2a8d7e,0x05b397730
424 .quad 0x168763fa6,0x0b0cd4768
425 .quad 0x1ed5a407a,0x0e78eb416
426 .quad 0x0d2c3ed1a,0x13c2b89c4
427 .quad 0x0995a5724,0x1641378f0
428 .quad 0x19b1afbc4,0x0d7a4825c
429 .quad 0x109ffedc0,0x08d96551c
430 .quad 0x0f2271e60,0x10f5ff2ba
431 .quad 0x00b0bf8ca,0x00bf80dd2
432 .quad 0x123888b7a,0x00167d312
433 .quad 0x1e888f7dc,0x18dcddd1c
434 .quad 0x002ee03b2,0x0f6076544
435 .quad 0x183e8d8fe,0x06a45d2b2
436 .quad 0x133d7a042,0x026f6a60a
437 .quad 0x116b0f50c,0x1dd3e10e8
438 .quad 0x05fabe670,0x1a2adb74e
439 .quad 0x130004488,0x0de87806c
440 .quad 0x000bcf5f6,0x19d34af3a
441 .quad 0x18f0c7078,0x014338754
442 .quad 0x017f27698,0x049c3cc9c
443 .quad 0x058ca5f00,0x15e3e77ee
444 .quad 0x1af900c24,0x068bce87a
445 .quad 0x0b5cfca28,0x0dd07448e
446 .quad 0x0ded288f8,0x1524fa6c6
447 .quad 0x059f229bc,0x1d8048348
448 .quad 0x06d390dec,0x16cba8aca
449 .quad 0x037170390,0x0a3e3e02c
450 .quad 0x06353c1cc,0x042d98888
451 .quad 0x0c4584f5c,0x0d73c7bea
452 .quad 0x1f16a3418,0x1329d9f7e
453 .quad 0x0531377e2,0x185137662
454 .quad 0x1d8d9ca7c,0x1b1c69528
455 .quad 0x0b25b29f2,0x18a08b5bc
456 .quad 0x19fb2a8b0,0x02178513a
457 .quad 0x1a08fe6ac,0x1da758ae0
458 .quad 0x045cddf4e,0x0e0ac139e
459 .quad 0x1a91647f2,0x169cf9eb0
460 .quad 0x1a0f717c4,0x0170076fa
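
The deleted routine above splits large buffers into three interleaved streams, runs the crc32 instruction on each, and recombines the partial CRCs with PCLMULQDQ against K_table. As a hedged sketch of only the simple paths (the quadword loop and the byte tail), the snippet below uses the SSE4.2 intrinsics; the three-way split and the PCLMULQDQ recombination are deliberately omitted.

/*
 * Sketch only (not crc_pcl): CRC32C with the SSE4.2 crc32 instruction,
 * 8 bytes at a time plus a byte tail. Build with -msse4.2.
 */
#include <nmmintrin.h>   /* _mm_crc32_u64, _mm_crc32_u8 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint32_t crc32c_by8_sketch(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint64_t c = crc;

	while (len >= 8) {                 /* quadword loop, as in the "by-1" path */
		uint64_t q;
		memcpy(&q, p, sizeof(q));  /* portable unaligned load */
		c = _mm_crc32_u64(c, q);
		p += 8;
		len -= 8;
	}
	while (len--)                      /* 0..7 trailing bytes (less_than_8) */
		c = _mm_crc32_u8((uint32_t)c, *p++);
	return (uint32_t)c;
}
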
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 6759dd1135b..976aa64d9a2 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -20,7 +20,6 @@
20#include <crypto/gf128mul.h> 20#include <crypto/gf128mul.h>
21#include <crypto/internal/hash.h> 21#include <crypto/internal/hash.h>
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/cpu_device_id.h>
24 23
25#define GHASH_BLOCK_SIZE 16 24#define GHASH_BLOCK_SIZE 16
26#define GHASH_DIGEST_SIZE 16 25#define GHASH_DIGEST_SIZE 16
@@ -150,6 +149,7 @@ static struct shash_alg ghash_alg = {
150 .cra_blocksize = GHASH_BLOCK_SIZE, 149 .cra_blocksize = GHASH_BLOCK_SIZE,
151 .cra_ctxsize = sizeof(struct ghash_ctx), 150 .cra_ctxsize = sizeof(struct ghash_ctx),
152 .cra_module = THIS_MODULE, 151 .cra_module = THIS_MODULE,
152 .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
153 }, 153 },
154}; 154};
155 155
@@ -287,24 +287,22 @@ static struct ahash_alg ghash_async_alg = {
287 .cra_blocksize = GHASH_BLOCK_SIZE, 287 .cra_blocksize = GHASH_BLOCK_SIZE,
288 .cra_type = &crypto_ahash_type, 288 .cra_type = &crypto_ahash_type,
289 .cra_module = THIS_MODULE, 289 .cra_module = THIS_MODULE,
290 .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
290 .cra_init = ghash_async_init_tfm, 291 .cra_init = ghash_async_init_tfm,
291 .cra_exit = ghash_async_exit_tfm, 292 .cra_exit = ghash_async_exit_tfm,
292 }, 293 },
293 }, 294 },
294}; 295};
295 296
296static const struct x86_cpu_id pcmul_cpu_id[] = {
297 X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */
298 {}
299};
300MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
301
302static int __init ghash_pclmulqdqni_mod_init(void) 297static int __init ghash_pclmulqdqni_mod_init(void)
303{ 298{
304 int err; 299 int err;
305 300
306 if (!x86_match_cpu(pcmul_cpu_id)) 301 if (!cpu_has_pclmulqdq) {
302 printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
303 " detected.\n");
307 return -ENODEV; 304 return -ENODEV;
305 }
308 306
309 err = crypto_register_shash(&ghash_alg); 307 err = crypto_register_shash(&ghash_alg);
310 if (err) 308 if (err)
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
deleted file mode 100644
index f7b6ea2ddfd..00000000000
--- a/arch/x86/crypto/glue_helper-asm-avx.S
+++ /dev/null
@@ -1,91 +0,0 @@
1/*
2 * Shared glue code for 128bit block ciphers, AVX assembler macros
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
19 vmovdqu (0*16)(src), x0; \
20 vmovdqu (1*16)(src), x1; \
21 vmovdqu (2*16)(src), x2; \
22 vmovdqu (3*16)(src), x3; \
23 vmovdqu (4*16)(src), x4; \
24 vmovdqu (5*16)(src), x5; \
25 vmovdqu (6*16)(src), x6; \
26 vmovdqu (7*16)(src), x7;
27
28#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
29 vmovdqu x0, (0*16)(dst); \
30 vmovdqu x1, (1*16)(dst); \
31 vmovdqu x2, (2*16)(dst); \
32 vmovdqu x3, (3*16)(dst); \
33 vmovdqu x4, (4*16)(dst); \
34 vmovdqu x5, (5*16)(dst); \
35 vmovdqu x6, (6*16)(dst); \
36 vmovdqu x7, (7*16)(dst);
37
38#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
39 vpxor (0*16)(src), x1, x1; \
40 vpxor (1*16)(src), x2, x2; \
41 vpxor (2*16)(src), x3, x3; \
42 vpxor (3*16)(src), x4, x4; \
43 vpxor (4*16)(src), x5, x5; \
44 vpxor (5*16)(src), x6, x6; \
45 vpxor (6*16)(src), x7, x7; \
46 store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
47
48#define inc_le128(x, minus_one, tmp) \
49 vpcmpeqq minus_one, x, tmp; \
50 vpsubq minus_one, x, x; \
51 vpslldq $8, tmp, tmp; \
52 vpsubq tmp, x, x;
53
54#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
55 vpcmpeqd t0, t0, t0; \
56 vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
57 vmovdqa bswap, t1; \
58 \
59 /* load IV and byteswap */ \
60 vmovdqu (iv), x7; \
61 vpshufb t1, x7, x0; \
62 \
63 /* construct IVs */ \
64 inc_le128(x7, t0, t2); \
65 vpshufb t1, x7, x1; \
66 inc_le128(x7, t0, t2); \
67 vpshufb t1, x7, x2; \
68 inc_le128(x7, t0, t2); \
69 vpshufb t1, x7, x3; \
70 inc_le128(x7, t0, t2); \
71 vpshufb t1, x7, x4; \
72 inc_le128(x7, t0, t2); \
73 vpshufb t1, x7, x5; \
74 inc_le128(x7, t0, t2); \
75 vpshufb t1, x7, x6; \
76 inc_le128(x7, t0, t2); \
77 vmovdqa x7, t2; \
78 vpshufb t1, x7, x7; \
79 inc_le128(t2, t0, t1); \
80 vmovdqu t2, (iv);
81
82#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
83 vpxor (0*16)(src), x0, x0; \
84 vpxor (1*16)(src), x1, x1; \
85 vpxor (2*16)(src), x2, x2; \
86 vpxor (3*16)(src), x3, x3; \
87 vpxor (4*16)(src), x4, x4; \
88 vpxor (5*16)(src), x5, x5; \
89 vpxor (6*16)(src), x6, x6; \
90 vpxor (7*16)(src), x7, x7; \
91 store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
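
The load_ctr_8way/inc_le128 macros above keep the counter in little-endian form so the 128-bit increment is cheap, then byte-swap each generated block to big-endian before it is encrypted. A scalar sketch of that per-block counter generation, with illustrative names rather than the kernel's, assuming the byte swap happens where noted:

#include <stdint.h>

struct le128_sketch { uint64_t lo, hi; };   /* little-endian 128-bit counter */

static void inc_le128_sketch(struct le128_sketch *c)
{
	if (++c->lo == 0)    /* carry out of the low quadword */
		c->hi++;
}

/* Generate n consecutive counter blocks; the asm byte-swaps each one back
 * to big-endian before feeding it to the cipher. */
static void ctr_blocks_sketch(struct le128_sketch *ctr,
			      struct le128_sketch *out, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++) {
		out[i] = *ctr;
		inc_le128_sketch(ctr);
	}
}
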
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
deleted file mode 100644
index 22ce4f683e5..00000000000
--- a/arch/x86/crypto/glue_helper.c
+++ /dev/null
@@ -1,307 +0,0 @@
1/*
2 * Shared glue code for 128bit block ciphers
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
8 * CTR part based on code (crypto/ctr.c) by:
9 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
24 * USA
25 *
26 */
27
28#include <linux/module.h>
29#include <crypto/b128ops.h>
30#include <crypto/lrw.h>
31#include <crypto/xts.h>
32#include <asm/crypto/glue_helper.h>
33#include <crypto/scatterwalk.h>
34
35static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
36 struct blkcipher_desc *desc,
37 struct blkcipher_walk *walk)
38{
39 void *ctx = crypto_blkcipher_ctx(desc->tfm);
40 const unsigned int bsize = 128 / 8;
41 unsigned int nbytes, i, func_bytes;
42 bool fpu_enabled = false;
43 int err;
44
45 err = blkcipher_walk_virt(desc, walk);
46
47 while ((nbytes = walk->nbytes)) {
48 u8 *wsrc = walk->src.virt.addr;
49 u8 *wdst = walk->dst.virt.addr;
50
51 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
52 desc, fpu_enabled, nbytes);
53
54 for (i = 0; i < gctx->num_funcs; i++) {
55 func_bytes = bsize * gctx->funcs[i].num_blocks;
56
57 /* Process multi-block batch */
58 if (nbytes >= func_bytes) {
59 do {
60 gctx->funcs[i].fn_u.ecb(ctx, wdst,
61 wsrc);
62
63 wsrc += func_bytes;
64 wdst += func_bytes;
65 nbytes -= func_bytes;
66 } while (nbytes >= func_bytes);
67
68 if (nbytes < bsize)
69 goto done;
70 }
71 }
72
73done:
74 err = blkcipher_walk_done(desc, walk, nbytes);
75 }
76
77 glue_fpu_end(fpu_enabled);
78 return err;
79}
80
81int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
82 struct blkcipher_desc *desc, struct scatterlist *dst,
83 struct scatterlist *src, unsigned int nbytes)
84{
85 struct blkcipher_walk walk;
86
87 blkcipher_walk_init(&walk, dst, src, nbytes);
88 return __glue_ecb_crypt_128bit(gctx, desc, &walk);
89}
90EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit);
91
92static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn,
93 struct blkcipher_desc *desc,
94 struct blkcipher_walk *walk)
95{
96 void *ctx = crypto_blkcipher_ctx(desc->tfm);
97 const unsigned int bsize = 128 / 8;
98 unsigned int nbytes = walk->nbytes;
99 u128 *src = (u128 *)walk->src.virt.addr;
100 u128 *dst = (u128 *)walk->dst.virt.addr;
101 u128 *iv = (u128 *)walk->iv;
102
103 do {
104 u128_xor(dst, src, iv);
105 fn(ctx, (u8 *)dst, (u8 *)dst);
106 iv = dst;
107
108 src += 1;
109 dst += 1;
110 nbytes -= bsize;
111 } while (nbytes >= bsize);
112
113 *(u128 *)walk->iv = *iv;
114 return nbytes;
115}
116
117int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
118 struct blkcipher_desc *desc,
119 struct scatterlist *dst,
120 struct scatterlist *src, unsigned int nbytes)
121{
122 struct blkcipher_walk walk;
123 int err;
124
125 blkcipher_walk_init(&walk, dst, src, nbytes);
126 err = blkcipher_walk_virt(desc, &walk);
127
128 while ((nbytes = walk.nbytes)) {
129 nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk);
130 err = blkcipher_walk_done(desc, &walk, nbytes);
131 }
132
133 return err;
134}
135EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit);
136
137static unsigned int
138__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
139 struct blkcipher_desc *desc,
140 struct blkcipher_walk *walk)
141{
142 void *ctx = crypto_blkcipher_ctx(desc->tfm);
143 const unsigned int bsize = 128 / 8;
144 unsigned int nbytes = walk->nbytes;
145 u128 *src = (u128 *)walk->src.virt.addr;
146 u128 *dst = (u128 *)walk->dst.virt.addr;
147 u128 last_iv;
148 unsigned int num_blocks, func_bytes;
149 unsigned int i;
150
151 /* Start of the last block. */
152 src += nbytes / bsize - 1;
153 dst += nbytes / bsize - 1;
154
155 last_iv = *src;
156
157 for (i = 0; i < gctx->num_funcs; i++) {
158 num_blocks = gctx->funcs[i].num_blocks;
159 func_bytes = bsize * num_blocks;
160
161 /* Process multi-block batch */
162 if (nbytes >= func_bytes) {
163 do {
164 nbytes -= func_bytes - bsize;
165 src -= num_blocks - 1;
166 dst -= num_blocks - 1;
167
168 gctx->funcs[i].fn_u.cbc(ctx, dst, src);
169
170 nbytes -= bsize;
171 if (nbytes < bsize)
172 goto done;
173
174 u128_xor(dst, dst, src - 1);
175 src -= 1;
176 dst -= 1;
177 } while (nbytes >= func_bytes);
178
179 if (nbytes < bsize)
180 goto done;
181 }
182 }
183
184done:
185 u128_xor(dst, dst, (u128 *)walk->iv);
186 *(u128 *)walk->iv = last_iv;
187
188 return nbytes;
189}
190
191int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
192 struct blkcipher_desc *desc,
193 struct scatterlist *dst,
194 struct scatterlist *src, unsigned int nbytes)
195{
196 const unsigned int bsize = 128 / 8;
197 bool fpu_enabled = false;
198 struct blkcipher_walk walk;
199 int err;
200
201 blkcipher_walk_init(&walk, dst, src, nbytes);
202 err = blkcipher_walk_virt(desc, &walk);
203
204 while ((nbytes = walk.nbytes)) {
205 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
206 desc, fpu_enabled, nbytes);
207 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
208 err = blkcipher_walk_done(desc, &walk, nbytes);
209 }
210
211 glue_fpu_end(fpu_enabled);
212 return err;
213}
214EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
215
216static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
217 struct blkcipher_desc *desc,
218 struct blkcipher_walk *walk)
219{
220 void *ctx = crypto_blkcipher_ctx(desc->tfm);
221 u8 *src = (u8 *)walk->src.virt.addr;
222 u8 *dst = (u8 *)walk->dst.virt.addr;
223 unsigned int nbytes = walk->nbytes;
224 le128 ctrblk;
225 u128 tmp;
226
227 be128_to_le128(&ctrblk, (be128 *)walk->iv);
228
229 memcpy(&tmp, src, nbytes);
230 fn_ctr(ctx, &tmp, &tmp, &ctrblk);
231 memcpy(dst, &tmp, nbytes);
232
233 le128_to_be128((be128 *)walk->iv, &ctrblk);
234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
236
237static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
238 struct blkcipher_desc *desc,
239 struct blkcipher_walk *walk)
240{
241 const unsigned int bsize = 128 / 8;
242 void *ctx = crypto_blkcipher_ctx(desc->tfm);
243 unsigned int nbytes = walk->nbytes;
244 u128 *src = (u128 *)walk->src.virt.addr;
245 u128 *dst = (u128 *)walk->dst.virt.addr;
246 le128 ctrblk;
247 unsigned int num_blocks, func_bytes;
248 unsigned int i;
249
250 be128_to_le128(&ctrblk, (be128 *)walk->iv);
251
252 /* Process multi-block batch */
253 for (i = 0; i < gctx->num_funcs; i++) {
254 num_blocks = gctx->funcs[i].num_blocks;
255 func_bytes = bsize * num_blocks;
256
257 if (nbytes >= func_bytes) {
258 do {
259 gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
260
261 src += num_blocks;
262 dst += num_blocks;
263 nbytes -= func_bytes;
264 } while (nbytes >= func_bytes);
265
266 if (nbytes < bsize)
267 goto done;
268 }
269 }
270
271done:
272 le128_to_be128((be128 *)walk->iv, &ctrblk);
273 return nbytes;
274}
275
276int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
277 struct blkcipher_desc *desc, struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 const unsigned int bsize = 128 / 8;
281 bool fpu_enabled = false;
282 struct blkcipher_walk walk;
283 int err;
284
285 blkcipher_walk_init(&walk, dst, src, nbytes);
286 err = blkcipher_walk_virt_block(desc, &walk, bsize);
287
288 while ((nbytes = walk.nbytes) >= bsize) {
289 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
290 desc, fpu_enabled, nbytes);
291 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
292 err = blkcipher_walk_done(desc, &walk, nbytes);
293 }
294
295 glue_fpu_end(fpu_enabled);
296
297 if (walk.nbytes) {
298 glue_ctr_crypt_final_128bit(
299 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
300 err = blkcipher_walk_done(desc, &walk, 0);
301 }
302
303 return err;
304}
305EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
306
307MODULE_LICENSE("GPL");
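
A point worth spelling out from __glue_cbc_decrypt_128bit() above: the blocks are walked from last to first so that in-place operation (dst == src) still has each block's chaining value, the previous ciphertext, available when it is needed. A minimal scalar sketch of that order, assuming at least one block and with decrypt_one() as a hypothetical stand-in for the real single-block cipher:

#include <stdint.h>

typedef struct { uint64_t a, b; } u128_sketch;

static void xor128(u128_sketch *r, const u128_sketch *x, const u128_sketch *y)
{
	r->a = x->a ^ y->a;
	r->b = x->b ^ y->b;
}

/* Placeholder block decrypt; a real implementation would call the cipher. */
static void decrypt_one(void *ctx, u128_sketch *dst, const u128_sketch *src)
{
	(void)ctx;
	*dst = *src;
}

static void cbc_decrypt_sketch(void *ctx, u128_sketch *dst,
			       const u128_sketch *src, unsigned int nblocks,
			       u128_sketch *iv)
{
	u128_sketch last_iv = src[nblocks - 1];  /* next call chains from here */

	for (unsigned int i = nblocks; i-- > 0; ) {
		decrypt_one(ctx, &dst[i], &src[i]);
		xor128(&dst[i], &dst[i], i ? &src[i - 1] : iv);
	}
	*iv = last_iv;
}
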
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index a3a3c0205c1..bccb76d8098 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -97,6 +97,7 @@ static struct crypto_alg alg = {
97 .cra_ctxsize = sizeof(struct salsa20_ctx), 97 .cra_ctxsize = sizeof(struct salsa20_ctx),
98 .cra_alignmask = 3, 98 .cra_alignmask = 3,
99 .cra_module = THIS_MODULE, 99 .cra_module = THIS_MODULE,
100 .cra_list = LIST_HEAD_INIT(alg.cra_list),
100 .cra_u = { 101 .cra_u = {
101 .blkcipher = { 102 .blkcipher = {
102 .setkey = setkey, 103 .setkey = setkey,
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
deleted file mode 100644
index 02b0e9fe997..00000000000
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ /dev/null
@@ -1,754 +0,0 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include "glue_helper-asm-avx.S"
28
29.file "serpent-avx-x86_64-asm_64.S"
30
31.data
32.align 16
33
34.Lbswap128_mask:
35 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
36
37.text
38
39#define CTX %rdi
40
41/**********************************************************************
42 8-way AVX serpent
43 **********************************************************************/
44#define RA1 %xmm0
45#define RB1 %xmm1
46#define RC1 %xmm2
47#define RD1 %xmm3
48#define RE1 %xmm4
49
50#define tp %xmm5
51
52#define RA2 %xmm6
53#define RB2 %xmm7
54#define RC2 %xmm8
55#define RD2 %xmm9
56#define RE2 %xmm10
57
58#define RNOT %xmm11
59
60#define RK0 %xmm12
61#define RK1 %xmm13
62#define RK2 %xmm14
63#define RK3 %xmm15
64
65
66#define S0_1(x0, x1, x2, x3, x4) \
67 vpor x0, x3, tp; \
68 vpxor x3, x0, x0; \
69 vpxor x2, x3, x4; \
70 vpxor RNOT, x4, x4; \
71 vpxor x1, tp, x3; \
72 vpand x0, x1, x1; \
73 vpxor x4, x1, x1; \
74 vpxor x0, x2, x2;
75#define S0_2(x0, x1, x2, x3, x4) \
76 vpxor x3, x0, x0; \
77 vpor x0, x4, x4; \
78 vpxor x2, x0, x0; \
79 vpand x1, x2, x2; \
80 vpxor x2, x3, x3; \
81 vpxor RNOT, x1, x1; \
82 vpxor x4, x2, x2; \
83 vpxor x2, x1, x1;
84
85#define S1_1(x0, x1, x2, x3, x4) \
86 vpxor x0, x1, tp; \
87 vpxor x3, x0, x0; \
88 vpxor RNOT, x3, x3; \
89 vpand tp, x1, x4; \
90 vpor tp, x0, x0; \
91 vpxor x2, x3, x3; \
92 vpxor x3, x0, x0; \
93 vpxor x3, tp, x1;
94#define S1_2(x0, x1, x2, x3, x4) \
95 vpxor x4, x3, x3; \
96 vpor x4, x1, x1; \
97 vpxor x2, x4, x4; \
98 vpand x0, x2, x2; \
99 vpxor x1, x2, x2; \
100 vpor x0, x1, x1; \
101 vpxor RNOT, x0, x0; \
102 vpxor x2, x0, x0; \
103 vpxor x1, x4, x4;
104
105#define S2_1(x0, x1, x2, x3, x4) \
106 vpxor RNOT, x3, x3; \
107 vpxor x0, x1, x1; \
108 vpand x2, x0, tp; \
109 vpxor x3, tp, tp; \
110 vpor x0, x3, x3; \
111 vpxor x1, x2, x2; \
112 vpxor x1, x3, x3; \
113 vpand tp, x1, x1;
114#define S2_2(x0, x1, x2, x3, x4) \
115 vpxor x2, tp, tp; \
116 vpand x3, x2, x2; \
117 vpor x1, x3, x3; \
118 vpxor RNOT, tp, tp; \
119 vpxor tp, x3, x3; \
120 vpxor tp, x0, x4; \
121 vpxor x2, tp, x0; \
122 vpor x2, x1, x1;
123
124#define S3_1(x0, x1, x2, x3, x4) \
125 vpxor x3, x1, tp; \
126 vpor x0, x3, x3; \
127 vpand x0, x1, x4; \
128 vpxor x2, x0, x0; \
129 vpxor tp, x2, x2; \
130 vpand x3, tp, x1; \
131 vpxor x3, x2, x2; \
132 vpor x4, x0, x0; \
133 vpxor x3, x4, x4;
134#define S3_2(x0, x1, x2, x3, x4) \
135 vpxor x0, x1, x1; \
136 vpand x3, x0, x0; \
137 vpand x4, x3, x3; \
138 vpxor x2, x3, x3; \
139 vpor x1, x4, x4; \
140 vpand x1, x2, x2; \
141 vpxor x3, x4, x4; \
142 vpxor x3, x0, x0; \
143 vpxor x2, x3, x3;
144
145#define S4_1(x0, x1, x2, x3, x4) \
146 vpand x0, x3, tp; \
147 vpxor x3, x0, x0; \
148 vpxor x2, tp, tp; \
149 vpor x3, x2, x2; \
150 vpxor x1, x0, x0; \
151 vpxor tp, x3, x4; \
152 vpor x0, x2, x2; \
153 vpxor x1, x2, x2;
154#define S4_2(x0, x1, x2, x3, x4) \
155 vpand x0, x1, x1; \
156 vpxor x4, x1, x1; \
157 vpand x2, x4, x4; \
158 vpxor tp, x2, x2; \
159 vpxor x0, x4, x4; \
160 vpor x1, tp, x3; \
161 vpxor RNOT, x1, x1; \
162 vpxor x0, x3, x3;
163
164#define S5_1(x0, x1, x2, x3, x4) \
165 vpor x0, x1, tp; \
166 vpxor tp, x2, x2; \
167 vpxor RNOT, x3, x3; \
168 vpxor x0, x1, x4; \
169 vpxor x2, x0, x0; \
170 vpand x4, tp, x1; \
171 vpor x3, x4, x4; \
172 vpxor x0, x4, x4;
173#define S5_2(x0, x1, x2, x3, x4) \
174 vpand x3, x0, x0; \
175 vpxor x3, x1, x1; \
176 vpxor x2, x3, x3; \
177 vpxor x1, x0, x0; \
178 vpand x4, x2, x2; \
179 vpxor x2, x1, x1; \
180 vpand x0, x2, x2; \
181 vpxor x2, x3, x3;
182
183#define S6_1(x0, x1, x2, x3, x4) \
184 vpxor x0, x3, x3; \
185 vpxor x2, x1, tp; \
186 vpxor x0, x2, x2; \
187 vpand x3, x0, x0; \
188 vpor x3, tp, tp; \
189 vpxor RNOT, x1, x4; \
190 vpxor tp, x0, x0; \
191 vpxor x2, tp, x1;
192#define S6_2(x0, x1, x2, x3, x4) \
193 vpxor x4, x3, x3; \
194 vpxor x0, x4, x4; \
195 vpand x0, x2, x2; \
196 vpxor x1, x4, x4; \
197 vpxor x3, x2, x2; \
198 vpand x1, x3, x3; \
199 vpxor x0, x3, x3; \
200 vpxor x2, x1, x1;
201
202#define S7_1(x0, x1, x2, x3, x4) \
203 vpxor RNOT, x1, tp; \
204 vpxor RNOT, x0, x0; \
205 vpand x2, tp, x1; \
206 vpxor x3, x1, x1; \
207 vpor tp, x3, x3; \
208 vpxor x2, tp, x4; \
209 vpxor x3, x2, x2; \
210 vpxor x0, x3, x3; \
211 vpor x1, x0, x0;
212#define S7_2(x0, x1, x2, x3, x4) \
213 vpand x0, x2, x2; \
214 vpxor x4, x0, x0; \
215 vpxor x3, x4, x4; \
216 vpand x0, x3, x3; \
217 vpxor x1, x4, x4; \
218 vpxor x4, x2, x2; \
219 vpxor x1, x3, x3; \
220 vpor x0, x4, x4; \
221 vpxor x1, x4, x4;
222
223#define SI0_1(x0, x1, x2, x3, x4) \
224 vpxor x0, x1, x1; \
225 vpor x1, x3, tp; \
226 vpxor x1, x3, x4; \
227 vpxor RNOT, x0, x0; \
228 vpxor tp, x2, x2; \
229 vpxor x0, tp, x3; \
230 vpand x1, x0, x0; \
231 vpxor x2, x0, x0;
232#define SI0_2(x0, x1, x2, x3, x4) \
233 vpand x3, x2, x2; \
234 vpxor x4, x3, x3; \
235 vpxor x3, x2, x2; \
236 vpxor x3, x1, x1; \
237 vpand x0, x3, x3; \
238 vpxor x0, x1, x1; \
239 vpxor x2, x0, x0; \
240 vpxor x3, x4, x4;
241
242#define SI1_1(x0, x1, x2, x3, x4) \
243 vpxor x3, x1, x1; \
244 vpxor x2, x0, tp; \
245 vpxor RNOT, x2, x2; \
246 vpor x1, x0, x4; \
247 vpxor x3, x4, x4; \
248 vpand x1, x3, x3; \
249 vpxor x2, x1, x1; \
250 vpand x4, x2, x2;
251#define SI1_2(x0, x1, x2, x3, x4) \
252 vpxor x1, x4, x4; \
253 vpor x3, x1, x1; \
254 vpxor tp, x3, x3; \
255 vpxor tp, x2, x2; \
256 vpor x4, tp, x0; \
257 vpxor x4, x2, x2; \
258 vpxor x0, x1, x1; \
259 vpxor x1, x4, x4;
260
261#define SI2_1(x0, x1, x2, x3, x4) \
262 vpxor x1, x2, x2; \
263 vpxor RNOT, x3, tp; \
264 vpor x2, tp, tp; \
265 vpxor x3, x2, x2; \
266 vpxor x0, x3, x4; \
267 vpxor x1, tp, x3; \
268 vpor x2, x1, x1; \
269 vpxor x0, x2, x2;
270#define SI2_2(x0, x1, x2, x3, x4) \
271 vpxor x4, x1, x1; \
272 vpor x3, x4, x4; \
273 vpxor x3, x2, x2; \
274 vpxor x2, x4, x4; \
275 vpand x1, x2, x2; \
276 vpxor x3, x2, x2; \
277 vpxor x4, x3, x3; \
278 vpxor x0, x4, x4;
279
280#define SI3_1(x0, x1, x2, x3, x4) \
281 vpxor x1, x2, x2; \
282 vpand x2, x1, tp; \
283 vpxor x0, tp, tp; \
284 vpor x1, x0, x0; \
285 vpxor x3, x1, x4; \
286 vpxor x3, x0, x0; \
287 vpor tp, x3, x3; \
288 vpxor x2, tp, x1;
289#define SI3_2(x0, x1, x2, x3, x4) \
290 vpxor x3, x1, x1; \
291 vpxor x2, x0, x0; \
292 vpxor x3, x2, x2; \
293 vpand x1, x3, x3; \
294 vpxor x0, x1, x1; \
295 vpand x2, x0, x0; \
296 vpxor x3, x4, x4; \
297 vpxor x0, x3, x3; \
298 vpxor x1, x0, x0;
299
300#define SI4_1(x0, x1, x2, x3, x4) \
301 vpxor x3, x2, x2; \
302 vpand x1, x0, tp; \
303 vpxor x2, tp, tp; \
304 vpor x3, x2, x2; \
305 vpxor RNOT, x0, x4; \
306 vpxor tp, x1, x1; \
307 vpxor x2, tp, x0; \
308 vpand x4, x2, x2;
309#define SI4_2(x0, x1, x2, x3, x4) \
310 vpxor x0, x2, x2; \
311 vpor x4, x0, x0; \
312 vpxor x3, x0, x0; \
313 vpand x2, x3, x3; \
314 vpxor x3, x4, x4; \
315 vpxor x1, x3, x3; \
316 vpand x0, x1, x1; \
317 vpxor x1, x4, x4; \
318 vpxor x3, x0, x0;
319
320#define SI5_1(x0, x1, x2, x3, x4) \
321 vpor x2, x1, tp; \
322 vpxor x1, x2, x2; \
323 vpxor x3, tp, tp; \
324 vpand x1, x3, x3; \
325 vpxor x3, x2, x2; \
326 vpor x0, x3, x3; \
327 vpxor RNOT, x0, x0; \
328 vpxor x2, x3, x3; \
329 vpor x0, x2, x2;
330#define SI5_2(x0, x1, x2, x3, x4) \
331 vpxor tp, x1, x4; \
332 vpxor x4, x2, x2; \
333 vpand x0, x4, x4; \
334 vpxor tp, x0, x0; \
335 vpxor x3, tp, x1; \
336 vpand x2, x0, x0; \
337 vpxor x3, x2, x2; \
338 vpxor x2, x0, x0; \
339 vpxor x4, x2, x2; \
340 vpxor x3, x4, x4;
341
342#define SI6_1(x0, x1, x2, x3, x4) \
343 vpxor x2, x0, x0; \
344 vpand x3, x0, tp; \
345 vpxor x3, x2, x2; \
346 vpxor x2, tp, tp; \
347 vpxor x1, x3, x3; \
348 vpor x0, x2, x2; \
349 vpxor x3, x2, x2; \
350 vpand tp, x3, x3;
351#define SI6_2(x0, x1, x2, x3, x4) \
352 vpxor RNOT, tp, tp; \
353 vpxor x1, x3, x3; \
354 vpand x2, x1, x1; \
355 vpxor tp, x0, x4; \
356 vpxor x4, x3, x3; \
357 vpxor x2, x4, x4; \
358 vpxor x1, tp, x0; \
359 vpxor x0, x2, x2;
360
361#define SI7_1(x0, x1, x2, x3, x4) \
362 vpand x0, x3, tp; \
363 vpxor x2, x0, x0; \
364 vpor x3, x2, x2; \
365 vpxor x1, x3, x4; \
366 vpxor RNOT, x0, x0; \
367 vpor tp, x1, x1; \
368 vpxor x0, x4, x4; \
369 vpand x2, x0, x0; \
370 vpxor x1, x0, x0;
371#define SI7_2(x0, x1, x2, x3, x4) \
372 vpand x2, x1, x1; \
373 vpxor x2, tp, x3; \
374 vpxor x3, x4, x4; \
375 vpand x3, x2, x2; \
376 vpor x0, x3, x3; \
377 vpxor x4, x1, x1; \
378 vpxor x4, x3, x3; \
379 vpand x0, x4, x4; \
380 vpxor x2, x4, x4;
381
382#define get_key(i, j, t) \
383 vbroadcastss (4*(i)+(j))*4(CTX), t;
384
385#define K2(x0, x1, x2, x3, x4, i) \
386 get_key(i, 0, RK0); \
387 get_key(i, 1, RK1); \
388 get_key(i, 2, RK2); \
389 get_key(i, 3, RK3); \
390 vpxor RK0, x0 ## 1, x0 ## 1; \
391 vpxor RK1, x1 ## 1, x1 ## 1; \
392 vpxor RK2, x2 ## 1, x2 ## 1; \
393 vpxor RK3, x3 ## 1, x3 ## 1; \
394 vpxor RK0, x0 ## 2, x0 ## 2; \
395 vpxor RK1, x1 ## 2, x1 ## 2; \
396 vpxor RK2, x2 ## 2, x2 ## 2; \
397 vpxor RK3, x3 ## 2, x3 ## 2;
398
399#define LK2(x0, x1, x2, x3, x4, i) \
400 vpslld $13, x0 ## 1, x4 ## 1; \
401 vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
402 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
403 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
404 vpslld $3, x2 ## 1, x4 ## 1; \
405 vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
406 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
407 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
408 vpslld $13, x0 ## 2, x4 ## 2; \
409 vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
410 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
411 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
412 vpslld $3, x2 ## 2, x4 ## 2; \
413 vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
414 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
415 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
416 vpslld $1, x1 ## 1, x4 ## 1; \
417 vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
418 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
419 vpslld $3, x0 ## 1, x4 ## 1; \
420 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
421 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
422 get_key(i, 1, RK1); \
423 vpslld $1, x1 ## 2, x4 ## 2; \
424 vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
425 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
426 vpslld $3, x0 ## 2, x4 ## 2; \
427 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
428 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
429 get_key(i, 3, RK3); \
430 vpslld $7, x3 ## 1, x4 ## 1; \
431 vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
432 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
433 vpslld $7, x1 ## 1, x4 ## 1; \
434 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
435 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
436 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
437 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
438 get_key(i, 0, RK0); \
439 vpslld $7, x3 ## 2, x4 ## 2; \
440 vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
441 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
442 vpslld $7, x1 ## 2, x4 ## 2; \
443 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
444 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
445 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
446 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
447 get_key(i, 2, RK2); \
448 vpxor RK1, x1 ## 1, x1 ## 1; \
449 vpxor RK3, x3 ## 1, x3 ## 1; \
450 vpslld $5, x0 ## 1, x4 ## 1; \
451 vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
452 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
453 vpslld $22, x2 ## 1, x4 ## 1; \
454 vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
455 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
456 vpxor RK0, x0 ## 1, x0 ## 1; \
457 vpxor RK2, x2 ## 1, x2 ## 1; \
458 vpxor RK1, x1 ## 2, x1 ## 2; \
459 vpxor RK3, x3 ## 2, x3 ## 2; \
460 vpslld $5, x0 ## 2, x4 ## 2; \
461 vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
462 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
463 vpslld $22, x2 ## 2, x4 ## 2; \
464 vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
465 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
466 vpxor RK0, x0 ## 2, x0 ## 2; \
467 vpxor RK2, x2 ## 2, x2 ## 2;
468
469#define KL2(x0, x1, x2, x3, x4, i) \
470 vpxor RK0, x0 ## 1, x0 ## 1; \
471 vpxor RK2, x2 ## 1, x2 ## 1; \
472 vpsrld $5, x0 ## 1, x4 ## 1; \
473 vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
474 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
475 vpxor RK3, x3 ## 1, x3 ## 1; \
476 vpxor RK1, x1 ## 1, x1 ## 1; \
477 vpsrld $22, x2 ## 1, x4 ## 1; \
478 vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
479 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
480 vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
481 vpxor RK0, x0 ## 2, x0 ## 2; \
482 vpxor RK2, x2 ## 2, x2 ## 2; \
483 vpsrld $5, x0 ## 2, x4 ## 2; \
484 vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
485 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
486 vpxor RK3, x3 ## 2, x3 ## 2; \
487 vpxor RK1, x1 ## 2, x1 ## 2; \
488 vpsrld $22, x2 ## 2, x4 ## 2; \
489 vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
490 vpor x4 ## 2, x2 ## 2, x2 ## 2; \
491 vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
492 vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
493 vpslld $7, x1 ## 1, x4 ## 1; \
494 vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
495 vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
496 vpsrld $1, x1 ## 1, x4 ## 1; \
497 vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
498 vpor x4 ## 1, x1 ## 1, x1 ## 1; \
499 vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
500 vpslld $7, x1 ## 2, x4 ## 2; \
501 vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
502 vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
503 vpsrld $1, x1 ## 2, x4 ## 2; \
504 vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
505 vpor x4 ## 2, x1 ## 2, x1 ## 2; \
506 vpsrld $7, x3 ## 1, x4 ## 1; \
507 vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
508 vpor x4 ## 1, x3 ## 1, x3 ## 1; \
509 vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
510 vpslld $3, x0 ## 1, x4 ## 1; \
511 vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
512 vpsrld $7, x3 ## 2, x4 ## 2; \
513 vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
514 vpor x4 ## 2, x3 ## 2, x3 ## 2; \
515 vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
516 vpslld $3, x0 ## 2, x4 ## 2; \
517 vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
518 vpsrld $13, x0 ## 1, x4 ## 1; \
519 vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
520 vpor x4 ## 1, x0 ## 1, x0 ## 1; \
521 vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
522 vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
523 vpsrld $3, x2 ## 1, x4 ## 1; \
524 vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
525 vpor x4 ## 1, x2 ## 1, x2 ## 1; \
526 vpsrld $13, x0 ## 2, x4 ## 2; \
527 vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
528 vpor x4 ## 2, x0 ## 2, x0 ## 2; \
529 vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
530 vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
531 vpsrld $3, x2 ## 2, x4 ## 2; \
532 vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
533 vpor x4 ## 2, x2 ## 2, x2 ## 2;
534
535#define S(SBOX, x0, x1, x2, x3, x4) \
536 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
537 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
538 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
539 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
540
541#define SP(SBOX, x0, x1, x2, x3, x4, i) \
542 get_key(i, 0, RK0); \
543 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
544 get_key(i, 2, RK2); \
545 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
546 get_key(i, 3, RK3); \
547 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
548 get_key(i, 1, RK1); \
549 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
550
551#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
552 vpunpckldq x1, x0, t0; \
553 vpunpckhdq x1, x0, t2; \
554 vpunpckldq x3, x2, t1; \
555 vpunpckhdq x3, x2, x3; \
556 \
557 vpunpcklqdq t1, t0, x0; \
558 vpunpckhqdq t1, t0, x1; \
559 vpunpcklqdq x3, t2, x2; \
560 vpunpckhqdq x3, t2, x3;
561
562#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
563 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
564
565#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
566 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
567
568.align 8
569.type __serpent_enc_blk8_avx,@function;
570
571__serpent_enc_blk8_avx:
572 /* input:
573 * %rdi: ctx, CTX
574 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
575 * output:
576 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
577 */
578
579 vpcmpeqd RNOT, RNOT, RNOT;
580
581 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
582 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
583
584 K2(RA, RB, RC, RD, RE, 0);
585 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
586 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
587 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
588 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
589 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
590 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
591 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
592 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
593 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
594 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
595 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
596 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
597 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
598 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
599 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
600 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
601 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
602 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
603 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
604 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
605 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
606 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
607 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
608 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
609 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
610 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
611 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
612 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
613 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
614 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
615 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
616 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
617
618 write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
619 write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
620
621 ret;
622
623.align 8
624.type __serpent_dec_blk8_avx,@function;
625
626__serpent_dec_blk8_avx:
627 /* input:
628 * %rdi: ctx, CTX
629 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
630 * output:
631 * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
632 */
633
634 vpcmpeqd RNOT, RNOT, RNOT;
635
636 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
637 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
638
639 K2(RA, RB, RC, RD, RE, 32);
640 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
641 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
642 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
643 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
644 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
645 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
646 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
647 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
648 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
649 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
650 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
651 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
652 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
653 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
654 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
655 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
656 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
657 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
658 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
659 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
660 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
661 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
662 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
663 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
664 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
665 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
666 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
667 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
668 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
669 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
670 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
671 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
672
673 write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
674 write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
675
676 ret;
677
678.align 8
679.global serpent_ecb_enc_8way_avx
680.type serpent_ecb_enc_8way_avx,@function;
681
682serpent_ecb_enc_8way_avx:
683 /* input:
684 * %rdi: ctx, CTX
685 * %rsi: dst
686 * %rdx: src
687 */
688
689 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
690
691 call __serpent_enc_blk8_avx;
692
693 store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
694
695 ret;
696
697.align 8
698.global serpent_ecb_dec_8way_avx
699.type serpent_ecb_dec_8way_avx,@function;
700
701serpent_ecb_dec_8way_avx:
702 /* input:
703 * %rdi: ctx, CTX
704 * %rsi: dst
705 * %rdx: src
706 */
707
708 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
709
710 call __serpent_dec_blk8_avx;
711
712 store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
713
714 ret;
715
716.align 8
717.global serpent_cbc_dec_8way_avx
718.type serpent_cbc_dec_8way_avx,@function;
719
720serpent_cbc_dec_8way_avx:
721 /* input:
722 * %rdi: ctx, CTX
723 * %rsi: dst
724 * %rdx: src
725 */
726
727 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
728
729 call __serpent_dec_blk8_avx;
730
731 store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
732
733 ret;
734
735.align 8
736.global serpent_ctr_8way_avx
737.type serpent_ctr_8way_avx,@function;
738
739serpent_ctr_8way_avx:
740 /* input:
741 * %rdi: ctx, CTX
742 * %rsi: dst
743 * %rdx: src
744 * %rcx: iv (little endian, 128bit)
745 */
746
747 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
748 RD2, RK0, RK1, RK2);
749
750 call __serpent_enc_blk8_avx;
751
752 store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
753
754 ret;
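
The LK2/KL2 macros above interleave the round-key XORs with two parallel copies of the Serpent linear transformation. As a scalar reference sketch of that transformation alone (key mixing omitted), operating on one 4x32-bit block:

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

/* Forward Serpent linear transformation, as computed by LK2 per block. */
static void serpent_lt_sketch(uint32_t x[4])
{
	x[0] = rol32(x[0], 13);
	x[2] = rol32(x[2], 3);
	x[1] ^= x[0] ^ x[2];
	x[3] ^= x[2] ^ (x[0] << 3);
	x[1] = rol32(x[1], 1);
	x[3] = rol32(x[3], 7);
	x[0] ^= x[1] ^ x[3];
	x[2] ^= x[3] ^ (x[1] << 7);
	x[0] = rol32(x[0], 5);
	x[2] = rol32(x[2], 22);
}
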
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
deleted file mode 100644
index c00053d42f9..00000000000
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ /dev/null
@@ -1,635 +0,0 @@
1/*
2 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-i586-asm_32.S"
28.text
29
30#define arg_ctx 4
31#define arg_dst 8
32#define arg_src 12
33#define arg_xor 16
34
35/**********************************************************************
36 4-way SSE2 serpent
37 **********************************************************************/
38#define CTX %edx
39
40#define RA %xmm0
41#define RB %xmm1
42#define RC %xmm2
43#define RD %xmm3
44#define RE %xmm4
45
46#define RT0 %xmm5
47#define RT1 %xmm6
48
49#define RNOT %xmm7
50
51#define get_key(i, j, t) \
52 movd (4*(i)+(j))*4(CTX), t; \
53 pshufd $0, t, t;
54
55#define K(x0, x1, x2, x3, x4, i) \
56 get_key(i, 0, x4); \
57 get_key(i, 1, RT0); \
58 get_key(i, 2, RT1); \
59 pxor x4, x0; \
60 pxor RT0, x1; \
61 pxor RT1, x2; \
62 get_key(i, 3, x4); \
63 pxor x4, x3;
64
65#define LK(x0, x1, x2, x3, x4, i) \
66 movdqa x0, x4; \
67 pslld $13, x0; \
68 psrld $(32 - 13), x4; \
69 por x4, x0; \
70 pxor x0, x1; \
71 movdqa x2, x4; \
72 pslld $3, x2; \
73 psrld $(32 - 3), x4; \
74 por x4, x2; \
75 pxor x2, x1; \
76 movdqa x1, x4; \
77 pslld $1, x1; \
78 psrld $(32 - 1), x4; \
79 por x4, x1; \
80 movdqa x0, x4; \
81 pslld $3, x4; \
82 pxor x2, x3; \
83 pxor x4, x3; \
84 movdqa x3, x4; \
85 pslld $7, x3; \
86 psrld $(32 - 7), x4; \
87 por x4, x3; \
88 movdqa x1, x4; \
89 pslld $7, x4; \
90 pxor x1, x0; \
91 pxor x3, x0; \
92 pxor x3, x2; \
93 pxor x4, x2; \
94 movdqa x0, x4; \
95 get_key(i, 1, RT0); \
96 pxor RT0, x1; \
97 get_key(i, 3, RT0); \
98 pxor RT0, x3; \
99 pslld $5, x0; \
100 psrld $(32 - 5), x4; \
101 por x4, x0; \
102 movdqa x2, x4; \
103 pslld $22, x2; \
104 psrld $(32 - 22), x4; \
105 por x4, x2; \
106 get_key(i, 0, RT0); \
107 pxor RT0, x0; \
108 get_key(i, 2, RT0); \
109 pxor RT0, x2;
110
111#define KL(x0, x1, x2, x3, x4, i) \
112 K(x0, x1, x2, x3, x4, i); \
113 movdqa x0, x4; \
114 psrld $5, x0; \
115 pslld $(32 - 5), x4; \
116 por x4, x0; \
117 movdqa x2, x4; \
118 psrld $22, x2; \
119 pslld $(32 - 22), x4; \
120 por x4, x2; \
121 pxor x3, x2; \
122 pxor x3, x0; \
123 movdqa x1, x4; \
124 pslld $7, x4; \
125 pxor x1, x0; \
126 pxor x4, x2; \
127 movdqa x1, x4; \
128 psrld $1, x1; \
129 pslld $(32 - 1), x4; \
130 por x4, x1; \
131 movdqa x3, x4; \
132 psrld $7, x3; \
133 pslld $(32 - 7), x4; \
134 por x4, x3; \
135 pxor x0, x1; \
136 movdqa x0, x4; \
137 pslld $3, x4; \
138 pxor x4, x3; \
139 movdqa x0, x4; \
140 psrld $13, x0; \
141 pslld $(32 - 13), x4; \
142 por x4, x0; \
143 pxor x2, x1; \
144 pxor x2, x3; \
145 movdqa x2, x4; \
146 psrld $3, x2; \
147 pslld $(32 - 3), x4; \
148 por x4, x2;
149
150#define S0(x0, x1, x2, x3, x4) \
151 movdqa x3, x4; \
152 por x0, x3; \
153 pxor x4, x0; \
154 pxor x2, x4; \
155 pxor RNOT, x4; \
156 pxor x1, x3; \
157 pand x0, x1; \
158 pxor x4, x1; \
159 pxor x0, x2; \
160 pxor x3, x0; \
161 por x0, x4; \
162 pxor x2, x0; \
163 pand x1, x2; \
164 pxor x2, x3; \
165 pxor RNOT, x1; \
166 pxor x4, x2; \
167 pxor x2, x1;
168
169#define S1(x0, x1, x2, x3, x4) \
170 movdqa x1, x4; \
171 pxor x0, x1; \
172 pxor x3, x0; \
173 pxor RNOT, x3; \
174 pand x1, x4; \
175 por x1, x0; \
176 pxor x2, x3; \
177 pxor x3, x0; \
178 pxor x3, x1; \
179 pxor x4, x3; \
180 por x4, x1; \
181 pxor x2, x4; \
182 pand x0, x2; \
183 pxor x1, x2; \
184 por x0, x1; \
185 pxor RNOT, x0; \
186 pxor x2, x0; \
187 pxor x1, x4;
188
189#define S2(x0, x1, x2, x3, x4) \
190 pxor RNOT, x3; \
191 pxor x0, x1; \
192 movdqa x0, x4; \
193 pand x2, x0; \
194 pxor x3, x0; \
195 por x4, x3; \
196 pxor x1, x2; \
197 pxor x1, x3; \
198 pand x0, x1; \
199 pxor x2, x0; \
200 pand x3, x2; \
201 por x1, x3; \
202 pxor RNOT, x0; \
203 pxor x0, x3; \
204 pxor x0, x4; \
205 pxor x2, x0; \
206 por x2, x1;
207
208#define S3(x0, x1, x2, x3, x4) \
209 movdqa x1, x4; \
210 pxor x3, x1; \
211 por x0, x3; \
212 pand x0, x4; \
213 pxor x2, x0; \
214 pxor x1, x2; \
215 pand x3, x1; \
216 pxor x3, x2; \
217 por x4, x0; \
218 pxor x3, x4; \
219 pxor x0, x1; \
220 pand x3, x0; \
221 pand x4, x3; \
222 pxor x2, x3; \
223 por x1, x4; \
224 pand x1, x2; \
225 pxor x3, x4; \
226 pxor x3, x0; \
227 pxor x2, x3;
228
229#define S4(x0, x1, x2, x3, x4) \
230 movdqa x3, x4; \
231 pand x0, x3; \
232 pxor x4, x0; \
233 pxor x2, x3; \
234 por x4, x2; \
235 pxor x1, x0; \
236 pxor x3, x4; \
237 por x0, x2; \
238 pxor x1, x2; \
239 pand x0, x1; \
240 pxor x4, x1; \
241 pand x2, x4; \
242 pxor x3, x2; \
243 pxor x0, x4; \
244 por x1, x3; \
245 pxor RNOT, x1; \
246 pxor x0, x3;
247
248#define S5(x0, x1, x2, x3, x4) \
249 movdqa x1, x4; \
250 por x0, x1; \
251 pxor x1, x2; \
252 pxor RNOT, x3; \
253 pxor x0, x4; \
254 pxor x2, x0; \
255 pand x4, x1; \
256 por x3, x4; \
257 pxor x0, x4; \
258 pand x3, x0; \
259 pxor x3, x1; \
260 pxor x2, x3; \
261 pxor x1, x0; \
262 pand x4, x2; \
263 pxor x2, x1; \
264 pand x0, x2; \
265 pxor x2, x3;
266
267#define S6(x0, x1, x2, x3, x4) \
268 movdqa x1, x4; \
269 pxor x0, x3; \
270 pxor x2, x1; \
271 pxor x0, x2; \
272 pand x3, x0; \
273 por x3, x1; \
274 pxor RNOT, x4; \
275 pxor x1, x0; \
276 pxor x2, x1; \
277 pxor x4, x3; \
278 pxor x0, x4; \
279 pand x0, x2; \
280 pxor x1, x4; \
281 pxor x3, x2; \
282 pand x1, x3; \
283 pxor x0, x3; \
284 pxor x2, x1;
285
286#define S7(x0, x1, x2, x3, x4) \
287 pxor RNOT, x1; \
288 movdqa x1, x4; \
289 pxor RNOT, x0; \
290 pand x2, x1; \
291 pxor x3, x1; \
292 por x4, x3; \
293 pxor x2, x4; \
294 pxor x3, x2; \
295 pxor x0, x3; \
296 por x1, x0; \
297 pand x0, x2; \
298 pxor x4, x0; \
299 pxor x3, x4; \
300 pand x0, x3; \
301 pxor x1, x4; \
302 pxor x4, x2; \
303 pxor x1, x3; \
304 por x0, x4; \
305 pxor x1, x4;
306
307#define SI0(x0, x1, x2, x3, x4) \
308 movdqa x3, x4; \
309 pxor x0, x1; \
310 por x1, x3; \
311 pxor x1, x4; \
312 pxor RNOT, x0; \
313 pxor x3, x2; \
314 pxor x0, x3; \
315 pand x1, x0; \
316 pxor x2, x0; \
317 pand x3, x2; \
318 pxor x4, x3; \
319 pxor x3, x2; \
320 pxor x3, x1; \
321 pand x0, x3; \
322 pxor x0, x1; \
323 pxor x2, x0; \
324 pxor x3, x4;
325
326#define SI1(x0, x1, x2, x3, x4) \
327 pxor x3, x1; \
328 movdqa x0, x4; \
329 pxor x2, x0; \
330 pxor RNOT, x2; \
331 por x1, x4; \
332 pxor x3, x4; \
333 pand x1, x3; \
334 pxor x2, x1; \
335 pand x4, x2; \
336 pxor x1, x4; \
337 por x3, x1; \
338 pxor x0, x3; \
339 pxor x0, x2; \
340 por x4, x0; \
341 pxor x4, x2; \
342 pxor x0, x1; \
343 pxor x1, x4;
344
345#define SI2(x0, x1, x2, x3, x4) \
346 pxor x1, x2; \
347 movdqa x3, x4; \
348 pxor RNOT, x3; \
349 por x2, x3; \
350 pxor x4, x2; \
351 pxor x0, x4; \
352 pxor x1, x3; \
353 por x2, x1; \
354 pxor x0, x2; \
355 pxor x4, x1; \
356 por x3, x4; \
357 pxor x3, x2; \
358 pxor x2, x4; \
359 pand x1, x2; \
360 pxor x3, x2; \
361 pxor x4, x3; \
362 pxor x0, x4;
363
364#define SI3(x0, x1, x2, x3, x4) \
365 pxor x1, x2; \
366 movdqa x1, x4; \
367 pand x2, x1; \
368 pxor x0, x1; \
369 por x4, x0; \
370 pxor x3, x4; \
371 pxor x3, x0; \
372 por x1, x3; \
373 pxor x2, x1; \
374 pxor x3, x1; \
375 pxor x2, x0; \
376 pxor x3, x2; \
377 pand x1, x3; \
378 pxor x0, x1; \
379 pand x2, x0; \
380 pxor x3, x4; \
381 pxor x0, x3; \
382 pxor x1, x0;
383
384#define SI4(x0, x1, x2, x3, x4) \
385 pxor x3, x2; \
386 movdqa x0, x4; \
387 pand x1, x0; \
388 pxor x2, x0; \
389 por x3, x2; \
390 pxor RNOT, x4; \
391 pxor x0, x1; \
392 pxor x2, x0; \
393 pand x4, x2; \
394 pxor x0, x2; \
395 por x4, x0; \
396 pxor x3, x0; \
397 pand x2, x3; \
398 pxor x3, x4; \
399 pxor x1, x3; \
400 pand x0, x1; \
401 pxor x1, x4; \
402 pxor x3, x0;
403
404#define SI5(x0, x1, x2, x3, x4) \
405 movdqa x1, x4; \
406 por x2, x1; \
407 pxor x4, x2; \
408 pxor x3, x1; \
409 pand x4, x3; \
410 pxor x3, x2; \
411 por x0, x3; \
412 pxor RNOT, x0; \
413 pxor x2, x3; \
414 por x0, x2; \
415 pxor x1, x4; \
416 pxor x4, x2; \
417 pand x0, x4; \
418 pxor x1, x0; \
419 pxor x3, x1; \
420 pand x2, x0; \
421 pxor x3, x2; \
422 pxor x2, x0; \
423 pxor x4, x2; \
424 pxor x3, x4;
425
426#define SI6(x0, x1, x2, x3, x4) \
427 pxor x2, x0; \
428 movdqa x0, x4; \
429 pand x3, x0; \
430 pxor x3, x2; \
431 pxor x2, x0; \
432 pxor x1, x3; \
433 por x4, x2; \
434 pxor x3, x2; \
435 pand x0, x3; \
436 pxor RNOT, x0; \
437 pxor x1, x3; \
438 pand x2, x1; \
439 pxor x0, x4; \
440 pxor x4, x3; \
441 pxor x2, x4; \
442 pxor x1, x0; \
443 pxor x0, x2;
444
445#define SI7(x0, x1, x2, x3, x4) \
446 movdqa x3, x4; \
447 pand x0, x3; \
448 pxor x2, x0; \
449 por x4, x2; \
450 pxor x1, x4; \
451 pxor RNOT, x0; \
452 por x3, x1; \
453 pxor x0, x4; \
454 pand x2, x0; \
455 pxor x1, x0; \
456 pand x2, x1; \
457 pxor x2, x3; \
458 pxor x3, x4; \
459 pand x3, x2; \
460 por x0, x3; \
461 pxor x4, x1; \
462 pxor x4, x3; \
463 pand x0, x4; \
464 pxor x2, x4;
465
466#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
467 movdqa x0, t2; \
468 punpckldq x1, x0; \
469 punpckhdq x1, t2; \
470 movdqa x2, t1; \
471 punpckhdq x3, x2; \
472 punpckldq x3, t1; \
473 movdqa x0, x1; \
474 punpcklqdq t1, x0; \
475 punpckhqdq t1, x1; \
476 movdqa t2, x3; \
477 punpcklqdq x2, t2; \
478 punpckhqdq x2, x3; \
479 movdqa t2, x2;
480
481#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
482 movdqu (0*4*4)(in), x0; \
483 movdqu (1*4*4)(in), x1; \
484 movdqu (2*4*4)(in), x2; \
485 movdqu (3*4*4)(in), x3; \
486 \
487 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
488
489#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
490 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
491 \
492 movdqu x0, (0*4*4)(out); \
493 movdqu x1, (1*4*4)(out); \
494 movdqu x2, (2*4*4)(out); \
495 movdqu x3, (3*4*4)(out);
496
497#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
498 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
499 \
500 movdqu (0*4*4)(out), t0; \
501 pxor t0, x0; \
502 movdqu x0, (0*4*4)(out); \
503 movdqu (1*4*4)(out), t0; \
504 pxor t0, x1; \
505 movdqu x1, (1*4*4)(out); \
506 movdqu (2*4*4)(out), t0; \
507 pxor t0, x2; \
508 movdqu x2, (2*4*4)(out); \
509 movdqu (3*4*4)(out), t0; \
510 pxor t0, x3; \
511 movdqu x3, (3*4*4)(out);
512
513.align 8
514.global __serpent_enc_blk_4way
515.type __serpent_enc_blk_4way,@function;
516
517__serpent_enc_blk_4way:
518 /* input:
519 * arg_ctx(%esp): ctx, CTX
520 * arg_dst(%esp): dst
521 * arg_src(%esp): src
522 * arg_xor(%esp): bool, if true: xor output
523 */
524
525 pcmpeqd RNOT, RNOT;
526
527 movl arg_ctx(%esp), CTX;
528
529 movl arg_src(%esp), %eax;
530 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
531
532 K(RA, RB, RC, RD, RE, 0);
533 S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
534 S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
535 S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
536 S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
537 S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
538 S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
539 S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
540 S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
541 S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
542 S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
543 S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
544 S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
545 S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
546 S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
547 S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
548 S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
549 S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
550 S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
551 S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
552 S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
553 S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
554 S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
555 S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
556 S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
557 S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
558 S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
559 S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
560 S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
561 S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
562 S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
563 S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
564 S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
565
566 movl arg_dst(%esp), %eax;
567
568 cmpb $0, arg_xor(%esp);
569 jnz __enc_xor4;
570
571 write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
572
573 ret;
574
575__enc_xor4:
576 xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
577
578 ret;
579
580.align 8
581.global serpent_dec_blk_4way
582.type serpent_dec_blk_4way,@function;
583
584serpent_dec_blk_4way:
585 /* input:
586 * arg_ctx(%esp): ctx, CTX
587 * arg_dst(%esp): dst
588 * arg_src(%esp): src
589 */
590
591 pcmpeqd RNOT, RNOT;
592
593 movl arg_ctx(%esp), CTX;
594
595 movl arg_src(%esp), %eax;
596 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
597
598 K(RA, RB, RC, RD, RE, 32);
599 SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
600 SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
601 SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
602 SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
603 SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
604 SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
605 SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
606 SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
607 SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
608 SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
609 SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
610 SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
611 SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
612 SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
613 SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
614 SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
615 SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
616 SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
617 SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
618 SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
619 SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
620 SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
621 SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
622 SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
623 SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
624 SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
625 SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
626 SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
627 SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
628 SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
629 SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
630 SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
631
632 movl arg_dst(%esp), %eax;
633 write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
634
635 ret;
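The stack-argument comments in __serpent_enc_blk_4way and serpent_dec_blk_4way above (ctx, dst, src, plus an xor flag on the encryption entry) are the whole ABI the C glue relies on. The matching header, asm/crypto/serpent-sse2.h, is not part of this hunk, so the declarations and thin wrappers below are only a sketch of what it is expected to contain for 32-bit builds, not a verbatim copy:

    #include <linux/linkage.h>
    #include <linux/types.h>
    #include <crypto/serpent.h>

    /* four blocks per call on i586, matching the 4-way code above */
    #define SERPENT_PARALLEL_BLOCKS 4

    asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
                                           const u8 *src, bool xor);
    asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
                                         const u8 *src);

    /* ECB-style use: write the four encrypted blocks to dst */
    static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
                                            const u8 *src)
    {
            __serpent_enc_blk_4way(ctx, dst, src, false);
    }

    /* CTR-style use: xor the four encrypted blocks into dst (__enc_xor4 path) */
    static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx,
                                                u8 *dst, const u8 *src)
    {
            __serpent_enc_blk_4way(ctx, dst, src, true);
    }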
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
deleted file mode 100644
index 3ee1ff04d3e..00000000000
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ /dev/null
@@ -1,758 +0,0 @@
1/*
2 * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-x86_64-asm_64.S"
28.text
29
30#define CTX %rdi
31
32/**********************************************************************
33 8-way SSE2 serpent
34 **********************************************************************/
35#define RA1 %xmm0
36#define RB1 %xmm1
37#define RC1 %xmm2
38#define RD1 %xmm3
39#define RE1 %xmm4
40
41#define RA2 %xmm5
42#define RB2 %xmm6
43#define RC2 %xmm7
44#define RD2 %xmm8
45#define RE2 %xmm9
46
47#define RNOT %xmm10
48
49#define RK0 %xmm11
50#define RK1 %xmm12
51#define RK2 %xmm13
52#define RK3 %xmm14
53
54#define S0_1(x0, x1, x2, x3, x4) \
55 movdqa x3, x4; \
56 por x0, x3; \
57 pxor x4, x0; \
58 pxor x2, x4; \
59 pxor RNOT, x4; \
60 pxor x1, x3; \
61 pand x0, x1; \
62 pxor x4, x1; \
63 pxor x0, x2;
64#define S0_2(x0, x1, x2, x3, x4) \
65 pxor x3, x0; \
66 por x0, x4; \
67 pxor x2, x0; \
68 pand x1, x2; \
69 pxor x2, x3; \
70 pxor RNOT, x1; \
71 pxor x4, x2; \
72 pxor x2, x1;
73
74#define S1_1(x0, x1, x2, x3, x4) \
75 movdqa x1, x4; \
76 pxor x0, x1; \
77 pxor x3, x0; \
78 pxor RNOT, x3; \
79 pand x1, x4; \
80 por x1, x0; \
81 pxor x2, x3; \
82 pxor x3, x0; \
83 pxor x3, x1;
84#define S1_2(x0, x1, x2, x3, x4) \
85 pxor x4, x3; \
86 por x4, x1; \
87 pxor x2, x4; \
88 pand x0, x2; \
89 pxor x1, x2; \
90 por x0, x1; \
91 pxor RNOT, x0; \
92 pxor x2, x0; \
93 pxor x1, x4;
94
95#define S2_1(x0, x1, x2, x3, x4) \
96 pxor RNOT, x3; \
97 pxor x0, x1; \
98 movdqa x0, x4; \
99 pand x2, x0; \
100 pxor x3, x0; \
101 por x4, x3; \
102 pxor x1, x2; \
103 pxor x1, x3; \
104 pand x0, x1;
105#define S2_2(x0, x1, x2, x3, x4) \
106 pxor x2, x0; \
107 pand x3, x2; \
108 por x1, x3; \
109 pxor RNOT, x0; \
110 pxor x0, x3; \
111 pxor x0, x4; \
112 pxor x2, x0; \
113 por x2, x1;
114
115#define S3_1(x0, x1, x2, x3, x4) \
116 movdqa x1, x4; \
117 pxor x3, x1; \
118 por x0, x3; \
119 pand x0, x4; \
120 pxor x2, x0; \
121 pxor x1, x2; \
122 pand x3, x1; \
123 pxor x3, x2; \
124 por x4, x0; \
125 pxor x3, x4;
126#define S3_2(x0, x1, x2, x3, x4) \
127 pxor x0, x1; \
128 pand x3, x0; \
129 pand x4, x3; \
130 pxor x2, x3; \
131 por x1, x4; \
132 pand x1, x2; \
133 pxor x3, x4; \
134 pxor x3, x0; \
135 pxor x2, x3;
136
137#define S4_1(x0, x1, x2, x3, x4) \
138 movdqa x3, x4; \
139 pand x0, x3; \
140 pxor x4, x0; \
141 pxor x2, x3; \
142 por x4, x2; \
143 pxor x1, x0; \
144 pxor x3, x4; \
145 por x0, x2; \
146 pxor x1, x2;
147#define S4_2(x0, x1, x2, x3, x4) \
148 pand x0, x1; \
149 pxor x4, x1; \
150 pand x2, x4; \
151 pxor x3, x2; \
152 pxor x0, x4; \
153 por x1, x3; \
154 pxor RNOT, x1; \
155 pxor x0, x3;
156
157#define S5_1(x0, x1, x2, x3, x4) \
158 movdqa x1, x4; \
159 por x0, x1; \
160 pxor x1, x2; \
161 pxor RNOT, x3; \
162 pxor x0, x4; \
163 pxor x2, x0; \
164 pand x4, x1; \
165 por x3, x4; \
166 pxor x0, x4;
167#define S5_2(x0, x1, x2, x3, x4) \
168 pand x3, x0; \
169 pxor x3, x1; \
170 pxor x2, x3; \
171 pxor x1, x0; \
172 pand x4, x2; \
173 pxor x2, x1; \
174 pand x0, x2; \
175 pxor x2, x3;
176
177#define S6_1(x0, x1, x2, x3, x4) \
178 movdqa x1, x4; \
179 pxor x0, x3; \
180 pxor x2, x1; \
181 pxor x0, x2; \
182 pand x3, x0; \
183 por x3, x1; \
184 pxor RNOT, x4; \
185 pxor x1, x0; \
186 pxor x2, x1;
187#define S6_2(x0, x1, x2, x3, x4) \
188 pxor x4, x3; \
189 pxor x0, x4; \
190 pand x0, x2; \
191 pxor x1, x4; \
192 pxor x3, x2; \
193 pand x1, x3; \
194 pxor x0, x3; \
195 pxor x2, x1;
196
197#define S7_1(x0, x1, x2, x3, x4) \
198 pxor RNOT, x1; \
199 movdqa x1, x4; \
200 pxor RNOT, x0; \
201 pand x2, x1; \
202 pxor x3, x1; \
203 por x4, x3; \
204 pxor x2, x4; \
205 pxor x3, x2; \
206 pxor x0, x3; \
207 por x1, x0;
208#define S7_2(x0, x1, x2, x3, x4) \
209 pand x0, x2; \
210 pxor x4, x0; \
211 pxor x3, x4; \
212 pand x0, x3; \
213 pxor x1, x4; \
214 pxor x4, x2; \
215 pxor x1, x3; \
216 por x0, x4; \
217 pxor x1, x4;
218
219#define SI0_1(x0, x1, x2, x3, x4) \
220 movdqa x3, x4; \
221 pxor x0, x1; \
222 por x1, x3; \
223 pxor x1, x4; \
224 pxor RNOT, x0; \
225 pxor x3, x2; \
226 pxor x0, x3; \
227 pand x1, x0; \
228 pxor x2, x0;
229#define SI0_2(x0, x1, x2, x3, x4) \
230 pand x3, x2; \
231 pxor x4, x3; \
232 pxor x3, x2; \
233 pxor x3, x1; \
234 pand x0, x3; \
235 pxor x0, x1; \
236 pxor x2, x0; \
237 pxor x3, x4;
238
239#define SI1_1(x0, x1, x2, x3, x4) \
240 pxor x3, x1; \
241 movdqa x0, x4; \
242 pxor x2, x0; \
243 pxor RNOT, x2; \
244 por x1, x4; \
245 pxor x3, x4; \
246 pand x1, x3; \
247 pxor x2, x1; \
248 pand x4, x2;
249#define SI1_2(x0, x1, x2, x3, x4) \
250 pxor x1, x4; \
251 por x3, x1; \
252 pxor x0, x3; \
253 pxor x0, x2; \
254 por x4, x0; \
255 pxor x4, x2; \
256 pxor x0, x1; \
257 pxor x1, x4;
258
259#define SI2_1(x0, x1, x2, x3, x4) \
260 pxor x1, x2; \
261 movdqa x3, x4; \
262 pxor RNOT, x3; \
263 por x2, x3; \
264 pxor x4, x2; \
265 pxor x0, x4; \
266 pxor x1, x3; \
267 por x2, x1; \
268 pxor x0, x2;
269#define SI2_2(x0, x1, x2, x3, x4) \
270 pxor x4, x1; \
271 por x3, x4; \
272 pxor x3, x2; \
273 pxor x2, x4; \
274 pand x1, x2; \
275 pxor x3, x2; \
276 pxor x4, x3; \
277 pxor x0, x4;
278
279#define SI3_1(x0, x1, x2, x3, x4) \
280 pxor x1, x2; \
281 movdqa x1, x4; \
282 pand x2, x1; \
283 pxor x0, x1; \
284 por x4, x0; \
285 pxor x3, x4; \
286 pxor x3, x0; \
287 por x1, x3; \
288 pxor x2, x1;
289#define SI3_2(x0, x1, x2, x3, x4) \
290 pxor x3, x1; \
291 pxor x2, x0; \
292 pxor x3, x2; \
293 pand x1, x3; \
294 pxor x0, x1; \
295 pand x2, x0; \
296 pxor x3, x4; \
297 pxor x0, x3; \
298 pxor x1, x0;
299
300#define SI4_1(x0, x1, x2, x3, x4) \
301 pxor x3, x2; \
302 movdqa x0, x4; \
303 pand x1, x0; \
304 pxor x2, x0; \
305 por x3, x2; \
306 pxor RNOT, x4; \
307 pxor x0, x1; \
308 pxor x2, x0; \
309 pand x4, x2;
310#define SI4_2(x0, x1, x2, x3, x4) \
311 pxor x0, x2; \
312 por x4, x0; \
313 pxor x3, x0; \
314 pand x2, x3; \
315 pxor x3, x4; \
316 pxor x1, x3; \
317 pand x0, x1; \
318 pxor x1, x4; \
319 pxor x3, x0;
320
321#define SI5_1(x0, x1, x2, x3, x4) \
322 movdqa x1, x4; \
323 por x2, x1; \
324 pxor x4, x2; \
325 pxor x3, x1; \
326 pand x4, x3; \
327 pxor x3, x2; \
328 por x0, x3; \
329 pxor RNOT, x0; \
330 pxor x2, x3; \
331 por x0, x2;
332#define SI5_2(x0, x1, x2, x3, x4) \
333 pxor x1, x4; \
334 pxor x4, x2; \
335 pand x0, x4; \
336 pxor x1, x0; \
337 pxor x3, x1; \
338 pand x2, x0; \
339 pxor x3, x2; \
340 pxor x2, x0; \
341 pxor x4, x2; \
342 pxor x3, x4;
343
344#define SI6_1(x0, x1, x2, x3, x4) \
345 pxor x2, x0; \
346 movdqa x0, x4; \
347 pand x3, x0; \
348 pxor x3, x2; \
349 pxor x2, x0; \
350 pxor x1, x3; \
351 por x4, x2; \
352 pxor x3, x2; \
353 pand x0, x3;
354#define SI6_2(x0, x1, x2, x3, x4) \
355 pxor RNOT, x0; \
356 pxor x1, x3; \
357 pand x2, x1; \
358 pxor x0, x4; \
359 pxor x4, x3; \
360 pxor x2, x4; \
361 pxor x1, x0; \
362 pxor x0, x2;
363
364#define SI7_1(x0, x1, x2, x3, x4) \
365 movdqa x3, x4; \
366 pand x0, x3; \
367 pxor x2, x0; \
368 por x4, x2; \
369 pxor x1, x4; \
370 pxor RNOT, x0; \
371 por x3, x1; \
372 pxor x0, x4; \
373 pand x2, x0; \
374 pxor x1, x0;
375#define SI7_2(x0, x1, x2, x3, x4) \
376 pand x2, x1; \
377 pxor x2, x3; \
378 pxor x3, x4; \
379 pand x3, x2; \
380 por x0, x3; \
381 pxor x4, x1; \
382 pxor x4, x3; \
383 pand x0, x4; \
384 pxor x2, x4;
385
386#define get_key(i, j, t) \
387 movd (4*(i)+(j))*4(CTX), t; \
388 pshufd $0, t, t;
389
390#define K2(x0, x1, x2, x3, x4, i) \
391 get_key(i, 0, RK0); \
392 get_key(i, 1, RK1); \
393 get_key(i, 2, RK2); \
394 get_key(i, 3, RK3); \
395 pxor RK0, x0 ## 1; \
396 pxor RK1, x1 ## 1; \
397 pxor RK2, x2 ## 1; \
398 pxor RK3, x3 ## 1; \
399 pxor RK0, x0 ## 2; \
400 pxor RK1, x1 ## 2; \
401 pxor RK2, x2 ## 2; \
402 pxor RK3, x3 ## 2;
403
404#define LK2(x0, x1, x2, x3, x4, i) \
405 movdqa x0 ## 1, x4 ## 1; \
406 pslld $13, x0 ## 1; \
407 psrld $(32 - 13), x4 ## 1; \
408 por x4 ## 1, x0 ## 1; \
409 pxor x0 ## 1, x1 ## 1; \
410 movdqa x2 ## 1, x4 ## 1; \
411 pslld $3, x2 ## 1; \
412 psrld $(32 - 3), x4 ## 1; \
413 por x4 ## 1, x2 ## 1; \
414 pxor x2 ## 1, x1 ## 1; \
415 movdqa x0 ## 2, x4 ## 2; \
416 pslld $13, x0 ## 2; \
417 psrld $(32 - 13), x4 ## 2; \
418 por x4 ## 2, x0 ## 2; \
419 pxor x0 ## 2, x1 ## 2; \
420 movdqa x2 ## 2, x4 ## 2; \
421 pslld $3, x2 ## 2; \
422 psrld $(32 - 3), x4 ## 2; \
423 por x4 ## 2, x2 ## 2; \
424 pxor x2 ## 2, x1 ## 2; \
425 movdqa x1 ## 1, x4 ## 1; \
426 pslld $1, x1 ## 1; \
427 psrld $(32 - 1), x4 ## 1; \
428 por x4 ## 1, x1 ## 1; \
429 movdqa x0 ## 1, x4 ## 1; \
430 pslld $3, x4 ## 1; \
431 pxor x2 ## 1, x3 ## 1; \
432 pxor x4 ## 1, x3 ## 1; \
433 movdqa x3 ## 1, x4 ## 1; \
434 get_key(i, 1, RK1); \
435 movdqa x1 ## 2, x4 ## 2; \
436 pslld $1, x1 ## 2; \
437 psrld $(32 - 1), x4 ## 2; \
438 por x4 ## 2, x1 ## 2; \
439 movdqa x0 ## 2, x4 ## 2; \
440 pslld $3, x4 ## 2; \
441 pxor x2 ## 2, x3 ## 2; \
442 pxor x4 ## 2, x3 ## 2; \
443 movdqa x3 ## 2, x4 ## 2; \
444 get_key(i, 3, RK3); \
445 pslld $7, x3 ## 1; \
446 psrld $(32 - 7), x4 ## 1; \
447 por x4 ## 1, x3 ## 1; \
448 movdqa x1 ## 1, x4 ## 1; \
449 pslld $7, x4 ## 1; \
450 pxor x1 ## 1, x0 ## 1; \
451 pxor x3 ## 1, x0 ## 1; \
452 pxor x3 ## 1, x2 ## 1; \
453 pxor x4 ## 1, x2 ## 1; \
454 get_key(i, 0, RK0); \
455 pslld $7, x3 ## 2; \
456 psrld $(32 - 7), x4 ## 2; \
457 por x4 ## 2, x3 ## 2; \
458 movdqa x1 ## 2, x4 ## 2; \
459 pslld $7, x4 ## 2; \
460 pxor x1 ## 2, x0 ## 2; \
461 pxor x3 ## 2, x0 ## 2; \
462 pxor x3 ## 2, x2 ## 2; \
463 pxor x4 ## 2, x2 ## 2; \
464 get_key(i, 2, RK2); \
465 pxor RK1, x1 ## 1; \
466 pxor RK3, x3 ## 1; \
467 movdqa x0 ## 1, x4 ## 1; \
468 pslld $5, x0 ## 1; \
469 psrld $(32 - 5), x4 ## 1; \
470 por x4 ## 1, x0 ## 1; \
471 movdqa x2 ## 1, x4 ## 1; \
472 pslld $22, x2 ## 1; \
473 psrld $(32 - 22), x4 ## 1; \
474 por x4 ## 1, x2 ## 1; \
475 pxor RK0, x0 ## 1; \
476 pxor RK2, x2 ## 1; \
477 pxor RK1, x1 ## 2; \
478 pxor RK3, x3 ## 2; \
479 movdqa x0 ## 2, x4 ## 2; \
480 pslld $5, x0 ## 2; \
481 psrld $(32 - 5), x4 ## 2; \
482 por x4 ## 2, x0 ## 2; \
483 movdqa x2 ## 2, x4 ## 2; \
484 pslld $22, x2 ## 2; \
485 psrld $(32 - 22), x4 ## 2; \
486 por x4 ## 2, x2 ## 2; \
487 pxor RK0, x0 ## 2; \
488 pxor RK2, x2 ## 2;
489
490#define KL2(x0, x1, x2, x3, x4, i) \
491 pxor RK0, x0 ## 1; \
492 pxor RK2, x2 ## 1; \
493 movdqa x0 ## 1, x4 ## 1; \
494 psrld $5, x0 ## 1; \
495 pslld $(32 - 5), x4 ## 1; \
496 por x4 ## 1, x0 ## 1; \
497 pxor RK3, x3 ## 1; \
498 pxor RK1, x1 ## 1; \
499 movdqa x2 ## 1, x4 ## 1; \
500 psrld $22, x2 ## 1; \
501 pslld $(32 - 22), x4 ## 1; \
502 por x4 ## 1, x2 ## 1; \
503 pxor x3 ## 1, x2 ## 1; \
504 pxor RK0, x0 ## 2; \
505 pxor RK2, x2 ## 2; \
506 movdqa x0 ## 2, x4 ## 2; \
507 psrld $5, x0 ## 2; \
508 pslld $(32 - 5), x4 ## 2; \
509 por x4 ## 2, x0 ## 2; \
510 pxor RK3, x3 ## 2; \
511 pxor RK1, x1 ## 2; \
512 movdqa x2 ## 2, x4 ## 2; \
513 psrld $22, x2 ## 2; \
514 pslld $(32 - 22), x4 ## 2; \
515 por x4 ## 2, x2 ## 2; \
516 pxor x3 ## 2, x2 ## 2; \
517 pxor x3 ## 1, x0 ## 1; \
518 movdqa x1 ## 1, x4 ## 1; \
519 pslld $7, x4 ## 1; \
520 pxor x1 ## 1, x0 ## 1; \
521 pxor x4 ## 1, x2 ## 1; \
522 movdqa x1 ## 1, x4 ## 1; \
523 psrld $1, x1 ## 1; \
524 pslld $(32 - 1), x4 ## 1; \
525 por x4 ## 1, x1 ## 1; \
526 pxor x3 ## 2, x0 ## 2; \
527 movdqa x1 ## 2, x4 ## 2; \
528 pslld $7, x4 ## 2; \
529 pxor x1 ## 2, x0 ## 2; \
530 pxor x4 ## 2, x2 ## 2; \
531 movdqa x1 ## 2, x4 ## 2; \
532 psrld $1, x1 ## 2; \
533 pslld $(32 - 1), x4 ## 2; \
534 por x4 ## 2, x1 ## 2; \
535 movdqa x3 ## 1, x4 ## 1; \
536 psrld $7, x3 ## 1; \
537 pslld $(32 - 7), x4 ## 1; \
538 por x4 ## 1, x3 ## 1; \
539 pxor x0 ## 1, x1 ## 1; \
540 movdqa x0 ## 1, x4 ## 1; \
541 pslld $3, x4 ## 1; \
542 pxor x4 ## 1, x3 ## 1; \
543 movdqa x0 ## 1, x4 ## 1; \
544 movdqa x3 ## 2, x4 ## 2; \
545 psrld $7, x3 ## 2; \
546 pslld $(32 - 7), x4 ## 2; \
547 por x4 ## 2, x3 ## 2; \
548 pxor x0 ## 2, x1 ## 2; \
549 movdqa x0 ## 2, x4 ## 2; \
550 pslld $3, x4 ## 2; \
551 pxor x4 ## 2, x3 ## 2; \
552 movdqa x0 ## 2, x4 ## 2; \
553 psrld $13, x0 ## 1; \
554 pslld $(32 - 13), x4 ## 1; \
555 por x4 ## 1, x0 ## 1; \
556 pxor x2 ## 1, x1 ## 1; \
557 pxor x2 ## 1, x3 ## 1; \
558 movdqa x2 ## 1, x4 ## 1; \
559 psrld $3, x2 ## 1; \
560 pslld $(32 - 3), x4 ## 1; \
561 por x4 ## 1, x2 ## 1; \
562 psrld $13, x0 ## 2; \
563 pslld $(32 - 13), x4 ## 2; \
564 por x4 ## 2, x0 ## 2; \
565 pxor x2 ## 2, x1 ## 2; \
566 pxor x2 ## 2, x3 ## 2; \
567 movdqa x2 ## 2, x4 ## 2; \
568 psrld $3, x2 ## 2; \
569 pslld $(32 - 3), x4 ## 2; \
570 por x4 ## 2, x2 ## 2;
571
572#define S(SBOX, x0, x1, x2, x3, x4) \
573 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
574 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
575 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
576 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
577
578#define SP(SBOX, x0, x1, x2, x3, x4, i) \
579 get_key(i, 0, RK0); \
580 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
581 get_key(i, 2, RK2); \
582 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
583 get_key(i, 3, RK3); \
584 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
585 get_key(i, 1, RK1); \
586 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
587
588#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
589 movdqa x0, t2; \
590 punpckldq x1, x0; \
591 punpckhdq x1, t2; \
592 movdqa x2, t1; \
593 punpckhdq x3, x2; \
594 punpckldq x3, t1; \
595 movdqa x0, x1; \
596 punpcklqdq t1, x0; \
597 punpckhqdq t1, x1; \
598 movdqa t2, x3; \
599 punpcklqdq x2, t2; \
600 punpckhqdq x2, x3; \
601 movdqa t2, x2;
602
603#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
604 movdqu (0*4*4)(in), x0; \
605 movdqu (1*4*4)(in), x1; \
606 movdqu (2*4*4)(in), x2; \
607 movdqu (3*4*4)(in), x3; \
608 \
609 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
610
611#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
612 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
613 \
614 movdqu x0, (0*4*4)(out); \
615 movdqu x1, (1*4*4)(out); \
616 movdqu x2, (2*4*4)(out); \
617 movdqu x3, (3*4*4)(out);
618
619#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
620 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
621 \
622 movdqu (0*4*4)(out), t0; \
623 pxor t0, x0; \
624 movdqu x0, (0*4*4)(out); \
625 movdqu (1*4*4)(out), t0; \
626 pxor t0, x1; \
627 movdqu x1, (1*4*4)(out); \
628 movdqu (2*4*4)(out), t0; \
629 pxor t0, x2; \
630 movdqu x2, (2*4*4)(out); \
631 movdqu (3*4*4)(out), t0; \
632 pxor t0, x3; \
633 movdqu x3, (3*4*4)(out);
634
635.align 8
636.global __serpent_enc_blk_8way
637.type __serpent_enc_blk_8way,@function;
638
639__serpent_enc_blk_8way:
640 /* input:
641 * %rdi: ctx, CTX
642 * %rsi: dst
643 * %rdx: src
644 * %rcx: bool, if true: xor output
645 */
646
647 pcmpeqd RNOT, RNOT;
648
649 leaq (4*4*4)(%rdx), %rax;
650 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
651 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
652
653 K2(RA, RB, RC, RD, RE, 0);
654 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
655 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
656 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
657 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
658 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
659 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
660 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
661 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
662 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
663 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
664 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
665 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
666 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
667 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
668 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
669 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
670 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
671 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
672 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
673 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
674 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
675 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
676 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
677 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
678 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
679 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
680 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
681 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
682 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
683 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
684 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
685 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
686
687 leaq (4*4*4)(%rsi), %rax;
688
689 testb %cl, %cl;
690 jnz __enc_xor8;
691
692 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
693 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
694
695 ret;
696
697__enc_xor8:
698 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
699 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
700
701 ret;
702
703.align 8
704.global serpent_dec_blk_8way
705.type serpent_dec_blk_8way,@function;
706
707serpent_dec_blk_8way:
708 /* input:
709 * %rdi: ctx, CTX
710 * %rsi: dst
711 * %rdx: src
712 */
713
714 pcmpeqd RNOT, RNOT;
715
716 leaq (4*4*4)(%rdx), %rax;
717 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
718 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
719
720 K2(RA, RB, RC, RD, RE, 32);
721 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
722 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
723 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
724 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
725 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
726 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
727 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
728 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
729 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
730 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
731 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
732 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
733 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
734 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
735 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
736 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
737 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
738 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
739 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
740 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
741 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
742 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
743 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
744 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
745 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
746 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
747 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
748 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
749 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
750 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
751 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
752 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
753
754 leaq (4*4*4)(%rsi), %rax;
755 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
756 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
757
758 ret;
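The 8-way code above processes two groups of four blocks per call: every logical register name is expanded into its "1" and "2" lanes by preprocessor token pasting (x0 ## 1, x0 ## 2), so one round macro touches both halves. As a concrete illustration using the register definitions at the top of the file, the first statement of K2(RA, RB, RC, RD, RE, 0) expands like this:

    /* in the macro:            pxor RK0, x0 ## 1;     pxor RK0, x0 ## 2;    */
    /* after token pasting:     pxor RK0, RA1;         pxor RK0, RA2;        */
    /* what the assembler sees: pxor %xmm11, %xmm0;    pxor %xmm11, %xmm5;   */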
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
deleted file mode 100644
index 52abaaf28e7..00000000000
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ /dev/null
@@ -1,595 +0,0 @@
1/*
2 * Glue Code for AVX assembler versions of Serpent Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Glue code based on serpent_sse2_glue.c by:
8 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/hardirq.h>
29#include <linux/types.h>
30#include <linux/crypto.h>
31#include <linux/err.h>
32#include <crypto/algapi.h>
33#include <crypto/serpent.h>
34#include <crypto/cryptd.h>
35#include <crypto/b128ops.h>
36#include <crypto/ctr.h>
37#include <crypto/lrw.h>
38#include <crypto/xts.h>
39#include <asm/xcr.h>
40#include <asm/xsave.h>
41#include <asm/crypto/serpent-avx.h>
42#include <asm/crypto/ablk_helper.h>
43#include <asm/crypto/glue_helper.h>
44
45static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
46{
47 be128 ctrblk;
48
49 le128_to_be128(&ctrblk, iv);
50 le128_inc(iv);
51
52 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
53 u128_xor(dst, src, (u128 *)&ctrblk);
54}
55
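serpent_crypt_ctr() above is the single-block CTR fallback used once fewer than SERPENT_PARALLEL_BLOCKS blocks remain: the counter stays little-endian in *iv so it can be incremented cheaply, is converted to a big-endian block, encrypted, and the resulting keystream is xored into dst. The following is a minimal user-space sketch of that per-block step; encrypt_block() stands in for __serpent_encrypt() and is an assumption of the sketch, not kernel API:

    #include <stdint.h>

    struct ctr128 { uint64_t lo, hi; };     /* counter held as LE words */

    static void ctr128_inc(struct ctr128 *c)
    {
            if (++c->lo == 0)
                    c->hi++;                /* carry into the high word */
    }

    static void ctr_one_block(void *key, uint8_t dst[16], const uint8_t src[16],
                              struct ctr128 *iv,
                              void (*encrypt_block)(void *key, uint8_t out[16],
                                                    const uint8_t in[16]))
    {
            uint8_t ctrblk[16], ks[16];
            int i;

            /* serialize the counter big-endian (cf. le128_to_be128()) */
            for (i = 0; i < 8; i++) {
                    ctrblk[i]     = (uint8_t)(iv->hi >> (56 - 8 * i));
                    ctrblk[8 + i] = (uint8_t)(iv->lo >> (56 - 8 * i));
            }
            ctr128_inc(iv);                 /* cf. le128_inc() */

            encrypt_block(key, ks, ctrblk); /* keystream = E_K(counter) */
            for (i = 0; i < 16; i++)
                    dst[i] = src[i] ^ ks[i];
    }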
56static const struct common_glue_ctx serpent_enc = {
57 .num_funcs = 2,
58 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
59
60 .funcs = { {
61 .num_blocks = SERPENT_PARALLEL_BLOCKS,
62 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
63 }, {
64 .num_blocks = 1,
65 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
66 } }
67};
68
69static const struct common_glue_ctx serpent_ctr = {
70 .num_funcs = 2,
71 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
72
73 .funcs = { {
74 .num_blocks = SERPENT_PARALLEL_BLOCKS,
75 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
76 }, {
77 .num_blocks = 1,
78 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
79 } }
80};
81
82static const struct common_glue_ctx serpent_dec = {
83 .num_funcs = 2,
84 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
85
86 .funcs = { {
87 .num_blocks = SERPENT_PARALLEL_BLOCKS,
88 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
89 }, {
90 .num_blocks = 1,
91 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
92 } }
93};
94
95static const struct common_glue_ctx serpent_dec_cbc = {
96 .num_funcs = 2,
97 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
98
99 .funcs = { {
100 .num_blocks = SERPENT_PARALLEL_BLOCKS,
101 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
102 }, {
103 .num_blocks = 1,
104 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
105 } }
106};
107
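The four common_glue_ctx tables above list candidate routines from widest to narrowest: the 8-way AVX functions first, the plain C single-block routines as the tail fallback, with fpu_blocks_limit deciding when an FPU/SIMD section is worth entering. The walk itself lives in glue_helper.c, which is not shown here, so the function below is only an illustrative sketch of how such a table is expected to be consumed for ECB (assuming the glue_helper.h definitions included above); it deliberately ignores scatterlist walking and FPU handling:

    static void ecb_walk(const struct common_glue_ctx *gctx, void *ctx,
                         u8 *dst, const u8 *src, unsigned int nbytes)
    {
            const unsigned int bsize = SERPENT_BLOCK_SIZE;
            unsigned int i;

            for (i = 0; i < gctx->num_funcs && nbytes >= bsize; i++) {
                    unsigned int func_bytes = bsize * gctx->funcs[i].num_blocks;

                    /* use this width for as long as enough data remains */
                    while (nbytes >= func_bytes) {
                            gctx->funcs[i].fn_u.ecb(ctx, dst, src);
                            src += func_bytes;
                            dst += func_bytes;
                            nbytes -= func_bytes;
                    }
            }
    }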
108static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
109 struct scatterlist *src, unsigned int nbytes)
110{
111 return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
112}
113
114static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
115 struct scatterlist *src, unsigned int nbytes)
116{
117 return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
118}
119
120static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
121 struct scatterlist *src, unsigned int nbytes)
122{
123 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
124 dst, src, nbytes);
125}
126
127static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
128 struct scatterlist *src, unsigned int nbytes)
129{
130 return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
131 nbytes);
132}
133
134static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
135 struct scatterlist *src, unsigned int nbytes)
136{
137 return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
138}
139
140static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
141{
142 return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
143 NULL, fpu_enabled, nbytes);
144}
145
146static inline void serpent_fpu_end(bool fpu_enabled)
147{
148 glue_fpu_end(fpu_enabled);
149}
150
151struct crypt_priv {
152 struct serpent_ctx *ctx;
153 bool fpu_enabled;
154};
155
156static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
157{
158 const unsigned int bsize = SERPENT_BLOCK_SIZE;
159 struct crypt_priv *ctx = priv;
160 int i;
161
162 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
163
164 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
165 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
166 return;
167 }
168
169 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
170 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
171}
172
173static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
174{
175 const unsigned int bsize = SERPENT_BLOCK_SIZE;
176 struct crypt_priv *ctx = priv;
177 int i;
178
179 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
180
181 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
182 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
183 return;
184 }
185
186 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
187 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
188}
189
190struct serpent_lrw_ctx {
191 struct lrw_table_ctx lrw_table;
192 struct serpent_ctx serpent_ctx;
193};
194
195static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
196 unsigned int keylen)
197{
198 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
199 int err;
200
201 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
202 SERPENT_BLOCK_SIZE);
203 if (err)
204 return err;
205
206 return lrw_init_table(&ctx->lrw_table, key + keylen -
207 SERPENT_BLOCK_SIZE);
208}
209
210static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
211 struct scatterlist *src, unsigned int nbytes)
212{
213 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
214 be128 buf[SERPENT_PARALLEL_BLOCKS];
215 struct crypt_priv crypt_ctx = {
216 .ctx = &ctx->serpent_ctx,
217 .fpu_enabled = false,
218 };
219 struct lrw_crypt_req req = {
220 .tbuf = buf,
221 .tbuflen = sizeof(buf),
222
223 .table_ctx = &ctx->lrw_table,
224 .crypt_ctx = &crypt_ctx,
225 .crypt_fn = encrypt_callback,
226 };
227 int ret;
228
229 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
230 ret = lrw_crypt(desc, dst, src, nbytes, &req);
231 serpent_fpu_end(crypt_ctx.fpu_enabled);
232
233 return ret;
234}
235
236static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
237 struct scatterlist *src, unsigned int nbytes)
238{
239 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
240 be128 buf[SERPENT_PARALLEL_BLOCKS];
241 struct crypt_priv crypt_ctx = {
242 .ctx = &ctx->serpent_ctx,
243 .fpu_enabled = false,
244 };
245 struct lrw_crypt_req req = {
246 .tbuf = buf,
247 .tbuflen = sizeof(buf),
248
249 .table_ctx = &ctx->lrw_table,
250 .crypt_ctx = &crypt_ctx,
251 .crypt_fn = decrypt_callback,
252 };
253 int ret;
254
255 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
256 ret = lrw_crypt(desc, dst, src, nbytes, &req);
257 serpent_fpu_end(crypt_ctx.fpu_enabled);
258
259 return ret;
260}
261
262static void lrw_exit_tfm(struct crypto_tfm *tfm)
263{
264 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
265
266 lrw_free_table(&ctx->lrw_table);
267}
268
269struct serpent_xts_ctx {
270 struct serpent_ctx tweak_ctx;
271 struct serpent_ctx crypt_ctx;
272};
273
274static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
275 unsigned int keylen)
276{
277 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
278 u32 *flags = &tfm->crt_flags;
279 int err;
280
281 /* key consists of keys of equal size concatenated, therefore
282 * the length must be even
283 */
284 if (keylen % 2) {
285 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
286 return -EINVAL;
287 }
288
289 /* first half of xts-key is for crypt */
290 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
291 if (err)
292 return err;
293
294 /* second half of xts-key is for tweak */
295 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
296}
297
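Worked example (illustrative only): a 64-byte xts(serpent) key passes the evenness check above; key[0..31] sets up crypt_ctx and key[32..63] sets up tweak_ctx, giving two independent 256-bit Serpent keys. This split is also why the xts algorithm definitions below advertise SERPENT_MIN_KEY_SIZE * 2 and SERPENT_MAX_KEY_SIZE * 2 rather than the plain single-key sizes.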
298static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
299 struct scatterlist *src, unsigned int nbytes)
300{
301 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
302 be128 buf[SERPENT_PARALLEL_BLOCKS];
303 struct crypt_priv crypt_ctx = {
304 .ctx = &ctx->crypt_ctx,
305 .fpu_enabled = false,
306 };
307 struct xts_crypt_req req = {
308 .tbuf = buf,
309 .tbuflen = sizeof(buf),
310
311 .tweak_ctx = &ctx->tweak_ctx,
312 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
313 .crypt_ctx = &crypt_ctx,
314 .crypt_fn = encrypt_callback,
315 };
316 int ret;
317
318 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
319 ret = xts_crypt(desc, dst, src, nbytes, &req);
320 serpent_fpu_end(crypt_ctx.fpu_enabled);
321
322 return ret;
323}
324
325static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
326 struct scatterlist *src, unsigned int nbytes)
327{
328 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
329 be128 buf[SERPENT_PARALLEL_BLOCKS];
330 struct crypt_priv crypt_ctx = {
331 .ctx = &ctx->crypt_ctx,
332 .fpu_enabled = false,
333 };
334 struct xts_crypt_req req = {
335 .tbuf = buf,
336 .tbuflen = sizeof(buf),
337
338 .tweak_ctx = &ctx->tweak_ctx,
339 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
340 .crypt_ctx = &crypt_ctx,
341 .crypt_fn = decrypt_callback,
342 };
343 int ret;
344
345 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
346 ret = xts_crypt(desc, dst, src, nbytes, &req);
347 serpent_fpu_end(crypt_ctx.fpu_enabled);
348
349 return ret;
350}
351
352static struct crypto_alg serpent_algs[10] = { {
353 .cra_name = "__ecb-serpent-avx",
354 .cra_driver_name = "__driver-ecb-serpent-avx",
355 .cra_priority = 0,
356 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
357 .cra_blocksize = SERPENT_BLOCK_SIZE,
358 .cra_ctxsize = sizeof(struct serpent_ctx),
359 .cra_alignmask = 0,
360 .cra_type = &crypto_blkcipher_type,
361 .cra_module = THIS_MODULE,
362 .cra_u = {
363 .blkcipher = {
364 .min_keysize = SERPENT_MIN_KEY_SIZE,
365 .max_keysize = SERPENT_MAX_KEY_SIZE,
366 .setkey = serpent_setkey,
367 .encrypt = ecb_encrypt,
368 .decrypt = ecb_decrypt,
369 },
370 },
371}, {
372 .cra_name = "__cbc-serpent-avx",
373 .cra_driver_name = "__driver-cbc-serpent-avx",
374 .cra_priority = 0,
375 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
376 .cra_blocksize = SERPENT_BLOCK_SIZE,
377 .cra_ctxsize = sizeof(struct serpent_ctx),
378 .cra_alignmask = 0,
379 .cra_type = &crypto_blkcipher_type,
380 .cra_module = THIS_MODULE,
381 .cra_u = {
382 .blkcipher = {
383 .min_keysize = SERPENT_MIN_KEY_SIZE,
384 .max_keysize = SERPENT_MAX_KEY_SIZE,
385 .setkey = serpent_setkey,
386 .encrypt = cbc_encrypt,
387 .decrypt = cbc_decrypt,
388 },
389 },
390}, {
391 .cra_name = "__ctr-serpent-avx",
392 .cra_driver_name = "__driver-ctr-serpent-avx",
393 .cra_priority = 0,
394 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
395 .cra_blocksize = 1,
396 .cra_ctxsize = sizeof(struct serpent_ctx),
397 .cra_alignmask = 0,
398 .cra_type = &crypto_blkcipher_type,
399 .cra_module = THIS_MODULE,
400 .cra_u = {
401 .blkcipher = {
402 .min_keysize = SERPENT_MIN_KEY_SIZE,
403 .max_keysize = SERPENT_MAX_KEY_SIZE,
404 .ivsize = SERPENT_BLOCK_SIZE,
405 .setkey = serpent_setkey,
406 .encrypt = ctr_crypt,
407 .decrypt = ctr_crypt,
408 },
409 },
410}, {
411 .cra_name = "__lrw-serpent-avx",
412 .cra_driver_name = "__driver-lrw-serpent-avx",
413 .cra_priority = 0,
414 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
415 .cra_blocksize = SERPENT_BLOCK_SIZE,
416 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
417 .cra_alignmask = 0,
418 .cra_type = &crypto_blkcipher_type,
419 .cra_module = THIS_MODULE,
420 .cra_exit = lrw_exit_tfm,
421 .cra_u = {
422 .blkcipher = {
423 .min_keysize = SERPENT_MIN_KEY_SIZE +
424 SERPENT_BLOCK_SIZE,
425 .max_keysize = SERPENT_MAX_KEY_SIZE +
426 SERPENT_BLOCK_SIZE,
427 .ivsize = SERPENT_BLOCK_SIZE,
428 .setkey = lrw_serpent_setkey,
429 .encrypt = lrw_encrypt,
430 .decrypt = lrw_decrypt,
431 },
432 },
433}, {
434 .cra_name = "__xts-serpent-avx",
435 .cra_driver_name = "__driver-xts-serpent-avx",
436 .cra_priority = 0,
437 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
438 .cra_blocksize = SERPENT_BLOCK_SIZE,
439 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
440 .cra_alignmask = 0,
441 .cra_type = &crypto_blkcipher_type,
442 .cra_module = THIS_MODULE,
443 .cra_u = {
444 .blkcipher = {
445 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
446 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
447 .ivsize = SERPENT_BLOCK_SIZE,
448 .setkey = xts_serpent_setkey,
449 .encrypt = xts_encrypt,
450 .decrypt = xts_decrypt,
451 },
452 },
453}, {
454 .cra_name = "ecb(serpent)",
455 .cra_driver_name = "ecb-serpent-avx",
456 .cra_priority = 500,
457 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
458 .cra_blocksize = SERPENT_BLOCK_SIZE,
459 .cra_ctxsize = sizeof(struct async_helper_ctx),
460 .cra_alignmask = 0,
461 .cra_type = &crypto_ablkcipher_type,
462 .cra_module = THIS_MODULE,
463 .cra_init = ablk_init,
464 .cra_exit = ablk_exit,
465 .cra_u = {
466 .ablkcipher = {
467 .min_keysize = SERPENT_MIN_KEY_SIZE,
468 .max_keysize = SERPENT_MAX_KEY_SIZE,
469 .setkey = ablk_set_key,
470 .encrypt = ablk_encrypt,
471 .decrypt = ablk_decrypt,
472 },
473 },
474}, {
475 .cra_name = "cbc(serpent)",
476 .cra_driver_name = "cbc-serpent-avx",
477 .cra_priority = 500,
478 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
479 .cra_blocksize = SERPENT_BLOCK_SIZE,
480 .cra_ctxsize = sizeof(struct async_helper_ctx),
481 .cra_alignmask = 0,
482 .cra_type = &crypto_ablkcipher_type,
483 .cra_module = THIS_MODULE,
484 .cra_init = ablk_init,
485 .cra_exit = ablk_exit,
486 .cra_u = {
487 .ablkcipher = {
488 .min_keysize = SERPENT_MIN_KEY_SIZE,
489 .max_keysize = SERPENT_MAX_KEY_SIZE,
490 .ivsize = SERPENT_BLOCK_SIZE,
491 .setkey = ablk_set_key,
492 .encrypt = __ablk_encrypt,
493 .decrypt = ablk_decrypt,
494 },
495 },
496}, {
497 .cra_name = "ctr(serpent)",
498 .cra_driver_name = "ctr-serpent-avx",
499 .cra_priority = 500,
500 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
501 .cra_blocksize = 1,
502 .cra_ctxsize = sizeof(struct async_helper_ctx),
503 .cra_alignmask = 0,
504 .cra_type = &crypto_ablkcipher_type,
505 .cra_module = THIS_MODULE,
506 .cra_init = ablk_init,
507 .cra_exit = ablk_exit,
508 .cra_u = {
509 .ablkcipher = {
510 .min_keysize = SERPENT_MIN_KEY_SIZE,
511 .max_keysize = SERPENT_MAX_KEY_SIZE,
512 .ivsize = SERPENT_BLOCK_SIZE,
513 .setkey = ablk_set_key,
514 .encrypt = ablk_encrypt,
515 .decrypt = ablk_encrypt,
516 .geniv = "chainiv",
517 },
518 },
519}, {
520 .cra_name = "lrw(serpent)",
521 .cra_driver_name = "lrw-serpent-avx",
522 .cra_priority = 500,
523 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
524 .cra_blocksize = SERPENT_BLOCK_SIZE,
525 .cra_ctxsize = sizeof(struct async_helper_ctx),
526 .cra_alignmask = 0,
527 .cra_type = &crypto_ablkcipher_type,
528 .cra_module = THIS_MODULE,
529 .cra_init = ablk_init,
530 .cra_exit = ablk_exit,
531 .cra_u = {
532 .ablkcipher = {
533 .min_keysize = SERPENT_MIN_KEY_SIZE +
534 SERPENT_BLOCK_SIZE,
535 .max_keysize = SERPENT_MAX_KEY_SIZE +
536 SERPENT_BLOCK_SIZE,
537 .ivsize = SERPENT_BLOCK_SIZE,
538 .setkey = ablk_set_key,
539 .encrypt = ablk_encrypt,
540 .decrypt = ablk_decrypt,
541 },
542 },
543}, {
544 .cra_name = "xts(serpent)",
545 .cra_driver_name = "xts-serpent-avx",
546 .cra_priority = 500,
547 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
548 .cra_blocksize = SERPENT_BLOCK_SIZE,
549 .cra_ctxsize = sizeof(struct async_helper_ctx),
550 .cra_alignmask = 0,
551 .cra_type = &crypto_ablkcipher_type,
552 .cra_module = THIS_MODULE,
553 .cra_init = ablk_init,
554 .cra_exit = ablk_exit,
555 .cra_u = {
556 .ablkcipher = {
557 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
558 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
559 .ivsize = SERPENT_BLOCK_SIZE,
560 .setkey = ablk_set_key,
561 .encrypt = ablk_encrypt,
562 .decrypt = ablk_decrypt,
563 },
564 },
565} };
566
567static int __init serpent_init(void)
568{
569 u64 xcr0;
570
571 if (!cpu_has_avx || !cpu_has_osxsave) {
572 printk(KERN_INFO "AVX instructions are not detected.\n");
573 return -ENODEV;
574 }
575
576 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
577 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
578 printk(KERN_INFO "AVX detected but unusable.\n");
579 return -ENODEV;
580 }
581
582 return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
583}
584
585static void __exit serpent_exit(void)
586{
587 crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
588}
589
590module_init(serpent_init);
591module_exit(serpent_exit);
592
593MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
594MODULE_LICENSE("GPL");
595MODULE_ALIAS("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
deleted file mode 100644
index 97a356ece24..00000000000
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ /dev/null
@@ -1,621 +0,0 @@
1/*
2 * Glue Code for SSE2 assembler versions of Serpent Cipher
3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Glue code based on aesni-intel_glue.c by:
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 *
10 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
11 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
12 * CTR part based on code (crypto/ctr.c) by:
13 * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 */
31
32#include <linux/module.h>
33#include <linux/hardirq.h>
34#include <linux/types.h>
35#include <linux/crypto.h>
36#include <linux/err.h>
37#include <crypto/algapi.h>
38#include <crypto/serpent.h>
39#include <crypto/cryptd.h>
40#include <crypto/b128ops.h>
41#include <crypto/ctr.h>
42#include <crypto/lrw.h>
43#include <crypto/xts.h>
44#include <asm/crypto/serpent-sse2.h>
45#include <asm/crypto/ablk_helper.h>
46#include <asm/crypto/glue_helper.h>
47
48static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
49{
50 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
51 unsigned int j;
52
53 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
54 ivs[j] = src[j];
55
56 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
57
58 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
59 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
60}
61
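serpent_decrypt_cbc_xway() above snapshots the first SERPENT_PARALLEL_BLOCKS - 1 ciphertext blocks before the in-place parallel decryption (dst may alias src), then re-applies the CBC chaining within the chunk: for j >= 1, P_j = D_K(C_j) xor C_(j-1). Block 0 is left as the raw D_K(C_0); completing it with the previous chunk's last ciphertext block (or the IV), and carrying this chunk's last ciphertext forward as the next IV, is presumably done by the generic CBC walk behind glue_cbc_decrypt_128bit(). That division of labour is an assumption of this note rather than something visible in this hunk.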
62static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
63{
64 be128 ctrblk;
65
66 le128_to_be128(&ctrblk, iv);
67 le128_inc(iv);
68
69 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
70 u128_xor(dst, src, (u128 *)&ctrblk);
71}
72
73static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
74 le128 *iv)
75{
76 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
77 unsigned int i;
78
79 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
80 if (dst != src)
81 dst[i] = src[i];
82
83 le128_to_be128(&ctrblks[i], iv);
84 le128_inc(iv);
85 }
86
87 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
88}
89
90static const struct common_glue_ctx serpent_enc = {
91 .num_funcs = 2,
92 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
93
94 .funcs = { {
95 .num_blocks = SERPENT_PARALLEL_BLOCKS,
96 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
97 }, {
98 .num_blocks = 1,
99 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
100 } }
101};
102
103static const struct common_glue_ctx serpent_ctr = {
104 .num_funcs = 2,
105 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
106
107 .funcs = { {
108 .num_blocks = SERPENT_PARALLEL_BLOCKS,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
110 }, {
111 .num_blocks = 1,
112 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
113 } }
114};
115
116static const struct common_glue_ctx serpent_dec = {
117 .num_funcs = 2,
118 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
119
120 .funcs = { {
121 .num_blocks = SERPENT_PARALLEL_BLOCKS,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
123 }, {
124 .num_blocks = 1,
125 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
126 } }
127};
128
129static const struct common_glue_ctx serpent_dec_cbc = {
130 .num_funcs = 2,
131 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
132
133 .funcs = { {
134 .num_blocks = SERPENT_PARALLEL_BLOCKS,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
136 }, {
137 .num_blocks = 1,
138 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
139 } }
140};
141
142static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
143 struct scatterlist *src, unsigned int nbytes)
144{
145 return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
146}
147
148static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
149 struct scatterlist *src, unsigned int nbytes)
150{
151 return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
152}
153
154static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
155 struct scatterlist *src, unsigned int nbytes)
156{
157 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
158 dst, src, nbytes);
159}
160
161static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
162 struct scatterlist *src, unsigned int nbytes)
163{
164 return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
165 nbytes);
166}
167
168static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
169 struct scatterlist *src, unsigned int nbytes)
170{
171 return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
172}
173
174static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
175{
176 return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
177 NULL, fpu_enabled, nbytes);
178}
179
180static inline void serpent_fpu_end(bool fpu_enabled)
181{
182 glue_fpu_end(fpu_enabled);
183}
184
185struct crypt_priv {
186 struct serpent_ctx *ctx;
187 bool fpu_enabled;
188};
189
190static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
191{
192 const unsigned int bsize = SERPENT_BLOCK_SIZE;
193 struct crypt_priv *ctx = priv;
194 int i;
195
196 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
197
198 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
199 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
200 return;
201 }
202
203 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
204 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
205}
206
207static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
208{
209 const unsigned int bsize = SERPENT_BLOCK_SIZE;
210 struct crypt_priv *ctx = priv;
211 int i;
212
213 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
214
215 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
216 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
217 return;
218 }
219
220 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
221 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
222}
223
224struct serpent_lrw_ctx {
225 struct lrw_table_ctx lrw_table;
226 struct serpent_ctx serpent_ctx;
227};
228
229static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
230 unsigned int keylen)
231{
232 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
233 int err;
234
235 err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
236 SERPENT_BLOCK_SIZE);
237 if (err)
238 return err;
239
240 return lrw_init_table(&ctx->lrw_table, key + keylen -
241 SERPENT_BLOCK_SIZE);
242}
243
244static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
245 struct scatterlist *src, unsigned int nbytes)
246{
247 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
248 be128 buf[SERPENT_PARALLEL_BLOCKS];
249 struct crypt_priv crypt_ctx = {
250 .ctx = &ctx->serpent_ctx,
251 .fpu_enabled = false,
252 };
253 struct lrw_crypt_req req = {
254 .tbuf = buf,
255 .tbuflen = sizeof(buf),
256
257 .table_ctx = &ctx->lrw_table,
258 .crypt_ctx = &crypt_ctx,
259 .crypt_fn = encrypt_callback,
260 };
261 int ret;
262
263 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
264 ret = lrw_crypt(desc, dst, src, nbytes, &req);
265 serpent_fpu_end(crypt_ctx.fpu_enabled);
266
267 return ret;
268}
269
270static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
271 struct scatterlist *src, unsigned int nbytes)
272{
273 struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
274 be128 buf[SERPENT_PARALLEL_BLOCKS];
275 struct crypt_priv crypt_ctx = {
276 .ctx = &ctx->serpent_ctx,
277 .fpu_enabled = false,
278 };
279 struct lrw_crypt_req req = {
280 .tbuf = buf,
281 .tbuflen = sizeof(buf),
282
283 .table_ctx = &ctx->lrw_table,
284 .crypt_ctx = &crypt_ctx,
285 .crypt_fn = decrypt_callback,
286 };
287 int ret;
288
289 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
290 ret = lrw_crypt(desc, dst, src, nbytes, &req);
291 serpent_fpu_end(crypt_ctx.fpu_enabled);
292
293 return ret;
294}
295
296static void lrw_exit_tfm(struct crypto_tfm *tfm)
297{
298 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
299
300 lrw_free_table(&ctx->lrw_table);
301}
302
303struct serpent_xts_ctx {
304 struct serpent_ctx tweak_ctx;
305 struct serpent_ctx crypt_ctx;
306};
307
308static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
309 unsigned int keylen)
310{
311 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
312 u32 *flags = &tfm->crt_flags;
313 int err;
314
315 /* key consists of keys of equal size concatenated, therefore
316 * the length must be even
317 */
318 if (keylen % 2) {
319 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
320 return -EINVAL;
321 }
322
323 /* first half of xts-key is for crypt */
324 err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
325 if (err)
326 return err;
327
328 /* second half of xts-key is for tweak */
329 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
330}
331
332static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
333 struct scatterlist *src, unsigned int nbytes)
334{
335 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
336 be128 buf[SERPENT_PARALLEL_BLOCKS];
337 struct crypt_priv crypt_ctx = {
338 .ctx = &ctx->crypt_ctx,
339 .fpu_enabled = false,
340 };
341 struct xts_crypt_req req = {
342 .tbuf = buf,
343 .tbuflen = sizeof(buf),
344
345 .tweak_ctx = &ctx->tweak_ctx,
346 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
347 .crypt_ctx = &crypt_ctx,
348 .crypt_fn = encrypt_callback,
349 };
350 int ret;
351
352 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
353 ret = xts_crypt(desc, dst, src, nbytes, &req);
354 serpent_fpu_end(crypt_ctx.fpu_enabled);
355
356 return ret;
357}
358
359static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
360 struct scatterlist *src, unsigned int nbytes)
361{
362 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
363 be128 buf[SERPENT_PARALLEL_BLOCKS];
364 struct crypt_priv crypt_ctx = {
365 .ctx = &ctx->crypt_ctx,
366 .fpu_enabled = false,
367 };
368 struct xts_crypt_req req = {
369 .tbuf = buf,
370 .tbuflen = sizeof(buf),
371
372 .tweak_ctx = &ctx->tweak_ctx,
373 .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
374 .crypt_ctx = &crypt_ctx,
375 .crypt_fn = decrypt_callback,
376 };
377 int ret;
378
379 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
380 ret = xts_crypt(desc, dst, src, nbytes, &req);
381 serpent_fpu_end(crypt_ctx.fpu_enabled);
382
383 return ret;
384}
385
386static struct crypto_alg serpent_algs[10] = { {
387 .cra_name = "__ecb-serpent-sse2",
388 .cra_driver_name = "__driver-ecb-serpent-sse2",
389 .cra_priority = 0,
390 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
391 .cra_blocksize = SERPENT_BLOCK_SIZE,
392 .cra_ctxsize = sizeof(struct serpent_ctx),
393 .cra_alignmask = 0,
394 .cra_type = &crypto_blkcipher_type,
395 .cra_module = THIS_MODULE,
396 .cra_u = {
397 .blkcipher = {
398 .min_keysize = SERPENT_MIN_KEY_SIZE,
399 .max_keysize = SERPENT_MAX_KEY_SIZE,
400 .setkey = serpent_setkey,
401 .encrypt = ecb_encrypt,
402 .decrypt = ecb_decrypt,
403 },
404 },
405}, {
406 .cra_name = "__cbc-serpent-sse2",
407 .cra_driver_name = "__driver-cbc-serpent-sse2",
408 .cra_priority = 0,
409 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
410 .cra_blocksize = SERPENT_BLOCK_SIZE,
411 .cra_ctxsize = sizeof(struct serpent_ctx),
412 .cra_alignmask = 0,
413 .cra_type = &crypto_blkcipher_type,
414 .cra_module = THIS_MODULE,
415 .cra_u = {
416 .blkcipher = {
417 .min_keysize = SERPENT_MIN_KEY_SIZE,
418 .max_keysize = SERPENT_MAX_KEY_SIZE,
419 .setkey = serpent_setkey,
420 .encrypt = cbc_encrypt,
421 .decrypt = cbc_decrypt,
422 },
423 },
424}, {
425 .cra_name = "__ctr-serpent-sse2",
426 .cra_driver_name = "__driver-ctr-serpent-sse2",
427 .cra_priority = 0,
428 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
429 .cra_blocksize = 1,
430 .cra_ctxsize = sizeof(struct serpent_ctx),
431 .cra_alignmask = 0,
432 .cra_type = &crypto_blkcipher_type,
433 .cra_module = THIS_MODULE,
434 .cra_u = {
435 .blkcipher = {
436 .min_keysize = SERPENT_MIN_KEY_SIZE,
437 .max_keysize = SERPENT_MAX_KEY_SIZE,
438 .ivsize = SERPENT_BLOCK_SIZE,
439 .setkey = serpent_setkey,
440 .encrypt = ctr_crypt,
441 .decrypt = ctr_crypt,
442 },
443 },
444}, {
445 .cra_name = "__lrw-serpent-sse2",
446 .cra_driver_name = "__driver-lrw-serpent-sse2",
447 .cra_priority = 0,
448 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
449 .cra_blocksize = SERPENT_BLOCK_SIZE,
450 .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
451 .cra_alignmask = 0,
452 .cra_type = &crypto_blkcipher_type,
453 .cra_module = THIS_MODULE,
454 .cra_exit = lrw_exit_tfm,
455 .cra_u = {
456 .blkcipher = {
457 .min_keysize = SERPENT_MIN_KEY_SIZE +
458 SERPENT_BLOCK_SIZE,
459 .max_keysize = SERPENT_MAX_KEY_SIZE +
460 SERPENT_BLOCK_SIZE,
461 .ivsize = SERPENT_BLOCK_SIZE,
462 .setkey = lrw_serpent_setkey,
463 .encrypt = lrw_encrypt,
464 .decrypt = lrw_decrypt,
465 },
466 },
467}, {
468 .cra_name = "__xts-serpent-sse2",
469 .cra_driver_name = "__driver-xts-serpent-sse2",
470 .cra_priority = 0,
471 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
472 .cra_blocksize = SERPENT_BLOCK_SIZE,
473 .cra_ctxsize = sizeof(struct serpent_xts_ctx),
474 .cra_alignmask = 0,
475 .cra_type = &crypto_blkcipher_type,
476 .cra_module = THIS_MODULE,
477 .cra_u = {
478 .blkcipher = {
479 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
480 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
481 .ivsize = SERPENT_BLOCK_SIZE,
482 .setkey = xts_serpent_setkey,
483 .encrypt = xts_encrypt,
484 .decrypt = xts_decrypt,
485 },
486 },
487}, {
488 .cra_name = "ecb(serpent)",
489 .cra_driver_name = "ecb-serpent-sse2",
490 .cra_priority = 400,
491 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
492 .cra_blocksize = SERPENT_BLOCK_SIZE,
493 .cra_ctxsize = sizeof(struct async_helper_ctx),
494 .cra_alignmask = 0,
495 .cra_type = &crypto_ablkcipher_type,
496 .cra_module = THIS_MODULE,
497 .cra_init = ablk_init,
498 .cra_exit = ablk_exit,
499 .cra_u = {
500 .ablkcipher = {
501 .min_keysize = SERPENT_MIN_KEY_SIZE,
502 .max_keysize = SERPENT_MAX_KEY_SIZE,
503 .setkey = ablk_set_key,
504 .encrypt = ablk_encrypt,
505 .decrypt = ablk_decrypt,
506 },
507 },
508}, {
509 .cra_name = "cbc(serpent)",
510 .cra_driver_name = "cbc-serpent-sse2",
511 .cra_priority = 400,
512 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
513 .cra_blocksize = SERPENT_BLOCK_SIZE,
514 .cra_ctxsize = sizeof(struct async_helper_ctx),
515 .cra_alignmask = 0,
516 .cra_type = &crypto_ablkcipher_type,
517 .cra_module = THIS_MODULE,
518 .cra_init = ablk_init,
519 .cra_exit = ablk_exit,
520 .cra_u = {
521 .ablkcipher = {
522 .min_keysize = SERPENT_MIN_KEY_SIZE,
523 .max_keysize = SERPENT_MAX_KEY_SIZE,
524 .ivsize = SERPENT_BLOCK_SIZE,
525 .setkey = ablk_set_key,
526 .encrypt = __ablk_encrypt,
527 .decrypt = ablk_decrypt,
528 },
529 },
530}, {
531 .cra_name = "ctr(serpent)",
532 .cra_driver_name = "ctr-serpent-sse2",
533 .cra_priority = 400,
534 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
535 .cra_blocksize = 1,
536 .cra_ctxsize = sizeof(struct async_helper_ctx),
537 .cra_alignmask = 0,
538 .cra_type = &crypto_ablkcipher_type,
539 .cra_module = THIS_MODULE,
540 .cra_init = ablk_init,
541 .cra_exit = ablk_exit,
542 .cra_u = {
543 .ablkcipher = {
544 .min_keysize = SERPENT_MIN_KEY_SIZE,
545 .max_keysize = SERPENT_MAX_KEY_SIZE,
546 .ivsize = SERPENT_BLOCK_SIZE,
547 .setkey = ablk_set_key,
548 .encrypt = ablk_encrypt,
549 .decrypt = ablk_encrypt,
550 .geniv = "chainiv",
551 },
552 },
553}, {
554 .cra_name = "lrw(serpent)",
555 .cra_driver_name = "lrw-serpent-sse2",
556 .cra_priority = 400,
557 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
558 .cra_blocksize = SERPENT_BLOCK_SIZE,
559 .cra_ctxsize = sizeof(struct async_helper_ctx),
560 .cra_alignmask = 0,
561 .cra_type = &crypto_ablkcipher_type,
562 .cra_module = THIS_MODULE,
563 .cra_init = ablk_init,
564 .cra_exit = ablk_exit,
565 .cra_u = {
566 .ablkcipher = {
567 .min_keysize = SERPENT_MIN_KEY_SIZE +
568 SERPENT_BLOCK_SIZE,
569 .max_keysize = SERPENT_MAX_KEY_SIZE +
570 SERPENT_BLOCK_SIZE,
571 .ivsize = SERPENT_BLOCK_SIZE,
572 .setkey = ablk_set_key,
573 .encrypt = ablk_encrypt,
574 .decrypt = ablk_decrypt,
575 },
576 },
577}, {
578 .cra_name = "xts(serpent)",
579 .cra_driver_name = "xts-serpent-sse2",
580 .cra_priority = 400,
581 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
582 .cra_blocksize = SERPENT_BLOCK_SIZE,
583 .cra_ctxsize = sizeof(struct async_helper_ctx),
584 .cra_alignmask = 0,
585 .cra_type = &crypto_ablkcipher_type,
586 .cra_module = THIS_MODULE,
587 .cra_init = ablk_init,
588 .cra_exit = ablk_exit,
589 .cra_u = {
590 .ablkcipher = {
591 .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
592 .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
593 .ivsize = SERPENT_BLOCK_SIZE,
594 .setkey = ablk_set_key,
595 .encrypt = ablk_encrypt,
596 .decrypt = ablk_decrypt,
597 },
598 },
599} };
600
601static int __init serpent_sse2_init(void)
602{
603 if (!cpu_has_xmm2) {
604 printk(KERN_INFO "SSE2 instructions are not detected.\n");
605 return -ENODEV;
606 }
607
608 return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
609}
610
611static void __exit serpent_sse2_exit(void)
612{
613 crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
614}
615
616module_init(serpent_sse2_init);
617module_exit(serpent_sse2_exit);
618
619MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
620MODULE_LICENSE("GPL");
621MODULE_ALIAS("serpent");
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
deleted file mode 100644
index 49d6987a73d..00000000000
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ /dev/null
@@ -1,558 +0,0 @@
1/*
2 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
3 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
4 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
5 * boost.
6 *
7 * This work was inspired by the vectorized implementation of Dean Gaudet.
8 * Additional information on it can be found at:
9 * http://www.arctic.org/~dean/crypto/sha1.html
10 *
11 * It was improved upon with more efficient vectorization of the message
12 * scheduling. This implementation has also been optimized for all current and
13 * several future generations of Intel CPUs.
14 *
15 * See this article for more information about the implementation details:
16 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
17 *
18 * Copyright (C) 2010, Intel Corp.
19 * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
20 * Ronen Zohar <ronen.zohar@intel.com>
21 *
22 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
23 * Author: Mathias Krause <minipli@googlemail.com>
24 *
25 * This program is free software; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License as published by
27 * the Free Software Foundation; either version 2 of the License, or
28 * (at your option) any later version.
29 */
30
31#define CTX %rdi // arg1
32#define BUF %rsi // arg2
33#define CNT %rdx // arg3
34
35#define REG_A %ecx
36#define REG_B %esi
37#define REG_C %edi
38#define REG_D %ebp
39#define REG_E %edx
40
41#define REG_T1 %eax
42#define REG_T2 %ebx
43
44#define K_BASE %r8
45#define HASH_PTR %r9
46#define BUFFER_PTR %r10
47#define BUFFER_END %r11
48
49#define W_TMP1 %xmm0
50#define W_TMP2 %xmm9
51
52#define W0 %xmm1
53#define W4 %xmm2
54#define W8 %xmm3
55#define W12 %xmm4
56#define W16 %xmm5
57#define W20 %xmm6
58#define W24 %xmm7
59#define W28 %xmm8
60
61#define XMM_SHUFB_BSWAP %xmm10
62
 63/* we keep a window of 64 w[i]+K pre-calculated values in a circular buffer */
64#define WK(t) (((t) & 15) * 4)(%rsp)
65#define W_PRECALC_AHEAD 16
66
67/*
68 * This macro implements the SHA-1 function's body for single 64-byte block
69 * param: function's name
70 */
71.macro SHA1_VECTOR_ASM name
72 .global \name
73 .type \name, @function
74 .align 32
75\name:
76 push %rbx
77 push %rbp
78 push %r12
79
80 mov %rsp, %r12
81 sub $64, %rsp # allocate workspace
82 and $~15, %rsp # align stack
83
84 mov CTX, HASH_PTR
85 mov BUF, BUFFER_PTR
86
87 shl $6, CNT # multiply by 64
88 add BUF, CNT
89 mov CNT, BUFFER_END
90
91 lea K_XMM_AR(%rip), K_BASE
92 xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
93
94 SHA1_PIPELINED_MAIN_BODY
95
96 # cleanup workspace
97 mov $8, %ecx
98 mov %rsp, %rdi
99 xor %rax, %rax
100 rep stosq
101
102 mov %r12, %rsp # deallocate workspace
103
104 pop %r12
105 pop %rbp
106 pop %rbx
107 ret
108
109 .size \name, .-\name
110.endm
111
112/*
113 * This macro implements 80 rounds of SHA-1 for one 64-byte block
114 */
115.macro SHA1_PIPELINED_MAIN_BODY
116 INIT_REGALLOC
117
118 mov (HASH_PTR), A
119 mov 4(HASH_PTR), B
120 mov 8(HASH_PTR), C
121 mov 12(HASH_PTR), D
122 mov 16(HASH_PTR), E
123
124 .set i, 0
125 .rept W_PRECALC_AHEAD
126 W_PRECALC i
127 .set i, (i+1)
128 .endr
129
130.align 4
1311:
132 RR F1,A,B,C,D,E,0
133 RR F1,D,E,A,B,C,2
134 RR F1,B,C,D,E,A,4
135 RR F1,E,A,B,C,D,6
136 RR F1,C,D,E,A,B,8
137
138 RR F1,A,B,C,D,E,10
139 RR F1,D,E,A,B,C,12
140 RR F1,B,C,D,E,A,14
141 RR F1,E,A,B,C,D,16
142 RR F1,C,D,E,A,B,18
143
144 RR F2,A,B,C,D,E,20
145 RR F2,D,E,A,B,C,22
146 RR F2,B,C,D,E,A,24
147 RR F2,E,A,B,C,D,26
148 RR F2,C,D,E,A,B,28
149
150 RR F2,A,B,C,D,E,30
151 RR F2,D,E,A,B,C,32
152 RR F2,B,C,D,E,A,34
153 RR F2,E,A,B,C,D,36
154 RR F2,C,D,E,A,B,38
155
156 RR F3,A,B,C,D,E,40
157 RR F3,D,E,A,B,C,42
158 RR F3,B,C,D,E,A,44
159 RR F3,E,A,B,C,D,46
160 RR F3,C,D,E,A,B,48
161
162 RR F3,A,B,C,D,E,50
163 RR F3,D,E,A,B,C,52
164 RR F3,B,C,D,E,A,54
165 RR F3,E,A,B,C,D,56
166 RR F3,C,D,E,A,B,58
167
168 add $64, BUFFER_PTR # move to the next 64-byte block
169 cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
170 cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
171
172 RR F4,A,B,C,D,E,60
173 RR F4,D,E,A,B,C,62
174 RR F4,B,C,D,E,A,64
175 RR F4,E,A,B,C,D,66
176 RR F4,C,D,E,A,B,68
177
178 RR F4,A,B,C,D,E,70
179 RR F4,D,E,A,B,C,72
180 RR F4,B,C,D,E,A,74
181 RR F4,E,A,B,C,D,76
182 RR F4,C,D,E,A,B,78
183
184 UPDATE_HASH (HASH_PTR), A
185 UPDATE_HASH 4(HASH_PTR), B
186 UPDATE_HASH 8(HASH_PTR), C
187 UPDATE_HASH 12(HASH_PTR), D
188 UPDATE_HASH 16(HASH_PTR), E
189
190 RESTORE_RENAMED_REGS
191 cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
192 jne 1b
193.endm
194
195.macro INIT_REGALLOC
196 .set A, REG_A
197 .set B, REG_B
198 .set C, REG_C
199 .set D, REG_D
200 .set E, REG_E
201 .set T1, REG_T1
202 .set T2, REG_T2
203.endm
204
205.macro RESTORE_RENAMED_REGS
206 # order is important (REG_C is where it should be)
207 mov B, REG_B
208 mov D, REG_D
209 mov A, REG_A
210 mov E, REG_E
211.endm
212
213.macro SWAP_REG_NAMES a, b
214 .set _T, \a
215 .set \a, \b
216 .set \b, _T
217.endm
218
219.macro F1 b, c, d
220 mov \c, T1
221 SWAP_REG_NAMES \c, T1
222 xor \d, T1
223 and \b, T1
224 xor \d, T1
225.endm
226
227.macro F2 b, c, d
228 mov \d, T1
229 SWAP_REG_NAMES \d, T1
230 xor \c, T1
231 xor \b, T1
232.endm
233
234.macro F3 b, c ,d
235 mov \c, T1
236 SWAP_REG_NAMES \c, T1
237 mov \b, T2
238 or \b, T1
239 and \c, T2
240 and \d, T1
241 or T2, T1
242.endm
243
244.macro F4 b, c, d
245 F2 \b, \c, \d
246.endm
247
248.macro UPDATE_HASH hash, val
249 add \hash, \val
250 mov \val, \hash
251.endm
252
253/*
254 * RR does two rounds of SHA-1 back to back with W[] pre-calc
255 * t1 = F(b, c, d); e += w(i)
256 * e += t1; b <<= 30; d += w(i+1);
257 * t1 = F(a, b, c);
258 * d += t1; a <<= 5;
259 * e += a;
260 * t1 = e; a >>= 7;
261 * t1 <<= 5;
262 * d += t1;
263 */
264.macro RR F, a, b, c, d, e, round
265 add WK(\round), \e
266 \F \b, \c, \d # t1 = F(b, c, d);
267 W_PRECALC (\round + W_PRECALC_AHEAD)
268 rol $30, \b
269 add T1, \e
270 add WK(\round + 1), \d
271
272 \F \a, \b, \c
273 W_PRECALC (\round + W_PRECALC_AHEAD + 1)
274 rol $5, \a
275 add \a, \e
276 add T1, \d
277 ror $7, \a # ((a <<r 5) >>r 7) => (a <<r 30)
278
279 mov \e, T1
280 SWAP_REG_NAMES \e, T1
281
282 rol $5, T1
283 add T1, \d
284
285 # write: \a, \b
286 # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
287.endm
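
For reference, one RR invocation is two ordinary SHA-1 rounds back to back. In plain C, with the round functions that the F1/F2/F3 macros above compute (F1 is "choose", F2 and F4 are parity, F3 is majority), a single round looks like the sketch below. The names rol32, sha1_round and f_* are illustrative; wk stands for the precombined w[i] + K value kept in the WK() window.

#include <stdint.h>

static uint32_t rol32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }

/* Round functions matching F1/F2/F3 above (F4 is the same as F2). */
static uint32_t f_ch(uint32_t b, uint32_t c, uint32_t d)  { return ((c ^ d) & b) ^ d; }
static uint32_t f_par(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }
static uint32_t f_maj(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | ((b | c) & d); }

/* One SHA-1 round; wk is w[i] + K, exactly what WK(round) holds. */
static void sha1_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                       uint32_t *e,
                       uint32_t (*f)(uint32_t, uint32_t, uint32_t),
                       uint32_t wk)
{
        uint32_t t = rol32(*a, 5) + f(*b, *c, *d) + *e + wk;

        *e = *d;                /* the macro renames registers instead */
        *d = *c;
        *c = rol32(*b, 30);
        *b = *a;
        *a = t;
}

Two consecutive sha1_round() calls, fed WK(round) and WK(round + 1), correspond to one RR invocation; the macro interleaves the next W_PRECALC step between them and avoids the variable shuffle by register renaming.
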
288
289.macro W_PRECALC r
290 .set i, \r
291
292 .if (i < 20)
293 .set K_XMM, 0
294 .elseif (i < 40)
295 .set K_XMM, 16
296 .elseif (i < 60)
297 .set K_XMM, 32
298 .elseif (i < 80)
299 .set K_XMM, 48
300 .endif
301
302 .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
303 .set i, ((\r) % 80) # pre-compute for the next iteration
304 .if (i == 0)
305 W_PRECALC_RESET
306 .endif
307 W_PRECALC_00_15
308 .elseif (i<32)
309 W_PRECALC_16_31
310 .elseif (i < 80) // rounds 32-79
311 W_PRECALC_32_79
312 .endif
313.endm
314
315.macro W_PRECALC_RESET
316 .set W, W0
317 .set W_minus_04, W4
318 .set W_minus_08, W8
319 .set W_minus_12, W12
320 .set W_minus_16, W16
321 .set W_minus_20, W20
322 .set W_minus_24, W24
323 .set W_minus_28, W28
324 .set W_minus_32, W
325.endm
326
327.macro W_PRECALC_ROTATE
328 .set W_minus_32, W_minus_28
329 .set W_minus_28, W_minus_24
330 .set W_minus_24, W_minus_20
331 .set W_minus_20, W_minus_16
332 .set W_minus_16, W_minus_12
333 .set W_minus_12, W_minus_08
334 .set W_minus_08, W_minus_04
335 .set W_minus_04, W
336 .set W, W_minus_32
337.endm
338
339.macro W_PRECALC_SSSE3
340
341.macro W_PRECALC_00_15
342 W_PRECALC_00_15_SSSE3
343.endm
344.macro W_PRECALC_16_31
345 W_PRECALC_16_31_SSSE3
346.endm
347.macro W_PRECALC_32_79
348 W_PRECALC_32_79_SSSE3
349.endm
350
351/* message scheduling pre-compute for rounds 0-15 */
352.macro W_PRECALC_00_15_SSSE3
353 .if ((i & 3) == 0)
354 movdqu (i*4)(BUFFER_PTR), W_TMP1
355 .elseif ((i & 3) == 1)
356 pshufb XMM_SHUFB_BSWAP, W_TMP1
357 movdqa W_TMP1, W
358 .elseif ((i & 3) == 2)
359 paddd (K_BASE), W_TMP1
360 .elseif ((i & 3) == 3)
361 movdqa W_TMP1, WK(i&~3)
362 W_PRECALC_ROTATE
363 .endif
364.endm
365
366/* message scheduling pre-compute for rounds 16-31
367 *
368 * - calculating last 32 w[i] values in 8 XMM registers
369 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
370 * instruction
371 *
372 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
373 * dependency, but improves for 32-79
374 */
375.macro W_PRECALC_16_31_SSSE3
376 # blended scheduling of vector and scalar instruction streams, one 4-wide
377 # vector iteration / 4 scalar rounds
378 .if ((i & 3) == 0)
379 movdqa W_minus_12, W
380 palignr $8, W_minus_16, W # w[i-14]
381 movdqa W_minus_04, W_TMP1
382 psrldq $4, W_TMP1 # w[i-3]
383 pxor W_minus_08, W
384 .elseif ((i & 3) == 1)
385 pxor W_minus_16, W_TMP1
386 pxor W_TMP1, W
387 movdqa W, W_TMP2
388 movdqa W, W_TMP1
389 pslldq $12, W_TMP2
390 .elseif ((i & 3) == 2)
391 psrld $31, W
392 pslld $1, W_TMP1
393 por W, W_TMP1
394 movdqa W_TMP2, W
395 psrld $30, W_TMP2
396 pslld $2, W
397 .elseif ((i & 3) == 3)
398 pxor W, W_TMP1
399 pxor W_TMP2, W_TMP1
400 movdqa W_TMP1, W
401 paddd K_XMM(K_BASE), W_TMP1
402 movdqa W_TMP1, WK(i&~3)
403 W_PRECALC_ROTATE
404 .endif
405.endm
406
407/* message scheduling pre-compute for rounds 32-79
408 *
409 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
410 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
411 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
412 */
413.macro W_PRECALC_32_79_SSSE3
414 .if ((i & 3) == 0)
415 movdqa W_minus_04, W_TMP1
416 pxor W_minus_28, W # W is W_minus_32 before xor
417 palignr $8, W_minus_08, W_TMP1
418 .elseif ((i & 3) == 1)
419 pxor W_minus_16, W
420 pxor W_TMP1, W
421 movdqa W, W_TMP1
422 .elseif ((i & 3) == 2)
423 psrld $30, W
424 pslld $2, W_TMP1
425 por W, W_TMP1
426 .elseif ((i & 3) == 3)
427 movdqa W_TMP1, W
428 paddd K_XMM(K_BASE), W_TMP1
429 movdqa W_TMP1, WK(i&~3)
430 W_PRECALC_ROTATE
431 .endif
432.endm
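
The comment before W_PRECALC_32_79_SSSE3 states that from round 32 on the rol-1 recurrence of the specification can be replaced by a rol-2 recurrence over w[i-6], w[i-16], w[i-28] and w[i-32], which removes the w[i] -> w[i-3] dependency and allows four schedule words per vector operation. The identity can be checked with a small scalar program; this is only a sanity check with an arbitrary test block, not kernel code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }

int main(void)
{
        uint32_t w[80];
        int i;

        for (i = 0; i < 16; i++)        /* arbitrary 512-bit test block */
                w[i] = 0x01000193u * (uint32_t)(i + 1);

        for (i = 16; i < 80; i++)       /* SHA-1 specification form */
                w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

        for (i = 32; i < 80; i++)       /* form used by W_PRECALC_32_79 */
                assert(w[i] == rol32(w[i - 6] ^ w[i - 16] ^ w[i - 28] ^ w[i - 32], 2));

        puts("both schedules agree for rounds 32-79");
        return 0;
}
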
433
434.endm // W_PRECALC_SSSE3
435
436
437#define K1 0x5a827999
438#define K2 0x6ed9eba1
439#define K3 0x8f1bbcdc
440#define K4 0xca62c1d6
441
442.section .rodata
443.align 16
444
445K_XMM_AR:
446 .long K1, K1, K1, K1
447 .long K2, K2, K2, K2
448 .long K3, K3, K3, K3
449 .long K4, K4, K4, K4
450
451BSWAP_SHUFB_CTL:
452 .long 0x00010203
453 .long 0x04050607
454 .long 0x08090a0b
455 .long 0x0c0d0e0f
456
457
458.section .text
459
460W_PRECALC_SSSE3
461.macro xmm_mov a, b
462 movdqu \a,\b
463.endm
464
465/* SSSE3 optimized implementation:
466 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
467 * unsigned int rounds);
468 */
469SHA1_VECTOR_ASM sha1_transform_ssse3
470
471#ifdef CONFIG_AS_AVX
472
473.macro W_PRECALC_AVX
474
475.purgem W_PRECALC_00_15
476.macro W_PRECALC_00_15
477 W_PRECALC_00_15_AVX
478.endm
479.purgem W_PRECALC_16_31
480.macro W_PRECALC_16_31
481 W_PRECALC_16_31_AVX
482.endm
483.purgem W_PRECALC_32_79
484.macro W_PRECALC_32_79
485 W_PRECALC_32_79_AVX
486.endm
487
488.macro W_PRECALC_00_15_AVX
489 .if ((i & 3) == 0)
490 vmovdqu (i*4)(BUFFER_PTR), W_TMP1
491 .elseif ((i & 3) == 1)
492 vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
493 .elseif ((i & 3) == 2)
494 vpaddd (K_BASE), W, W_TMP1
495 .elseif ((i & 3) == 3)
496 vmovdqa W_TMP1, WK(i&~3)
497 W_PRECALC_ROTATE
498 .endif
499.endm
500
501.macro W_PRECALC_16_31_AVX
502 .if ((i & 3) == 0)
503 vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
504 vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
505 vpxor W_minus_08, W, W
506 vpxor W_minus_16, W_TMP1, W_TMP1
507 .elseif ((i & 3) == 1)
508 vpxor W_TMP1, W, W
509 vpslldq $12, W, W_TMP2
510 vpslld $1, W, W_TMP1
511 .elseif ((i & 3) == 2)
512 vpsrld $31, W, W
513 vpor W, W_TMP1, W_TMP1
514 vpslld $2, W_TMP2, W
515 vpsrld $30, W_TMP2, W_TMP2
516 .elseif ((i & 3) == 3)
517 vpxor W, W_TMP1, W_TMP1
518 vpxor W_TMP2, W_TMP1, W
519 vpaddd K_XMM(K_BASE), W, W_TMP1
520 vmovdqu W_TMP1, WK(i&~3)
521 W_PRECALC_ROTATE
522 .endif
523.endm
524
525.macro W_PRECALC_32_79_AVX
526 .if ((i & 3) == 0)
527 vpalignr $8, W_minus_08, W_minus_04, W_TMP1
528 vpxor W_minus_28, W, W # W is W_minus_32 before xor
529 .elseif ((i & 3) == 1)
530 vpxor W_minus_16, W_TMP1, W_TMP1
531 vpxor W_TMP1, W, W
532 .elseif ((i & 3) == 2)
533 vpslld $2, W, W_TMP1
534 vpsrld $30, W, W
535 vpor W, W_TMP1, W
536 .elseif ((i & 3) == 3)
537 vpaddd K_XMM(K_BASE), W, W_TMP1
538 vmovdqu W_TMP1, WK(i&~3)
539 W_PRECALC_ROTATE
540 .endif
541.endm
542
543.endm // W_PRECALC_AVX
544
545W_PRECALC_AVX
546.purgem xmm_mov
547.macro xmm_mov a, b
548 vmovdqu \a,\b
549.endm
550
551
552/* AVX optimized implementation:
553 * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
554 * unsigned int rounds);
555 */
556SHA1_VECTOR_ASM sha1_transform_avx
557
558#endif
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
deleted file mode 100644
index 4a11a9d7245..00000000000
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ /dev/null
@@ -1,240 +0,0 @@
1/*
2 * Cryptographic API.
3 *
4 * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
5 * Supplemental SSE3 instructions.
6 *
7 * This file is based on sha1_generic.c
8 *
9 * Copyright (c) Alan Smithee.
10 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
11 * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
12 * Copyright (c) Mathias Krause <minipli@googlemail.com>
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option)
17 * any later version.
18 *
19 */
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <crypto/internal/hash.h>
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/mm.h>
27#include <linux/cryptohash.h>
28#include <linux/types.h>
29#include <crypto/sha.h>
30#include <asm/byteorder.h>
31#include <asm/i387.h>
32#include <asm/xcr.h>
33#include <asm/xsave.h>
34
35
36asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
37 unsigned int rounds);
38#ifdef CONFIG_AS_AVX
39asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
40 unsigned int rounds);
41#endif
42
43static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
44
45
46static int sha1_ssse3_init(struct shash_desc *desc)
47{
48 struct sha1_state *sctx = shash_desc_ctx(desc);
49
50 *sctx = (struct sha1_state){
51 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
52 };
53
54 return 0;
55}
56
57static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
58 unsigned int len, unsigned int partial)
59{
60 struct sha1_state *sctx = shash_desc_ctx(desc);
61 unsigned int done = 0;
62
63 sctx->count += len;
64
65 if (partial) {
66 done = SHA1_BLOCK_SIZE - partial;
67 memcpy(sctx->buffer + partial, data, done);
68 sha1_transform_asm(sctx->state, sctx->buffer, 1);
69 }
70
71 if (len - done >= SHA1_BLOCK_SIZE) {
72 const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
73
74 sha1_transform_asm(sctx->state, data + done, rounds);
75 done += rounds * SHA1_BLOCK_SIZE;
76 }
77
78 memcpy(sctx->buffer, data + done, len - done);
79
80 return 0;
81}
82
83static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
84 unsigned int len)
85{
86 struct sha1_state *sctx = shash_desc_ctx(desc);
87 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
88 int res;
89
90 /* Handle the fast case right here */
91 if (partial + len < SHA1_BLOCK_SIZE) {
92 sctx->count += len;
93 memcpy(sctx->buffer + partial, data, len);
94
95 return 0;
96 }
97
98 if (!irq_fpu_usable()) {
99 res = crypto_sha1_update(desc, data, len);
100 } else {
101 kernel_fpu_begin();
102 res = __sha1_ssse3_update(desc, data, len, partial);
103 kernel_fpu_end();
104 }
105
106 return res;
107}
108
109
110/* Add padding and return the message digest. */
111static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
112{
113 struct sha1_state *sctx = shash_desc_ctx(desc);
114 unsigned int i, index, padlen;
115 __be32 *dst = (__be32 *)out;
116 __be64 bits;
117 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
118
119 bits = cpu_to_be64(sctx->count << 3);
120
121 /* Pad out to 56 mod 64 and append length */
122 index = sctx->count % SHA1_BLOCK_SIZE;
123 padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
124 if (!irq_fpu_usable()) {
125 crypto_sha1_update(desc, padding, padlen);
126 crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
127 } else {
128 kernel_fpu_begin();
129 /* We need to fill a whole block for __sha1_ssse3_update() */
130 if (padlen <= 56) {
131 sctx->count += padlen;
132 memcpy(sctx->buffer + index, padding, padlen);
133 } else {
134 __sha1_ssse3_update(desc, padding, padlen, index);
135 }
136 __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
137 kernel_fpu_end();
138 }
139
140 /* Store state in digest */
141 for (i = 0; i < 5; i++)
142 dst[i] = cpu_to_be32(sctx->state[i]);
143
144 /* Wipe context */
145 memset(sctx, 0, sizeof(*sctx));
146
147 return 0;
148}
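
The index/padlen arithmetic above implements the standard SHA-1 padding rule: append one 0x80 byte, zero-fill until the length is 56 mod 64, then append the message length in bits as a 64-bit big-endian value. A stand-alone sketch of just that calculation (sha1_pad is an illustrative helper, not a kernel function, and buffer handling is simplified):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Build the SHA-1 padding for a message of 'count' bytes and return its
 * length.  The worst case is 64 + 8 = 72 bytes, so 'pad' must hold 72. */
static size_t sha1_pad(uint64_t count, uint8_t pad[72])
{
        uint64_t bits = count << 3;                     /* length in bits */
        size_t index = (size_t)(count % 64);
        size_t padlen = (index < 56) ? (56 - index) : ((64 + 56) - index);
        size_t i;

        memset(pad, 0, padlen);
        pad[0] = 0x80;                                  /* mandatory 1 bit */

        for (i = 0; i < 8; i++)                         /* big-endian bit count */
                pad[padlen + i] = (uint8_t)(bits >> (56 - 8 * i));

        return padlen + 8;
}

The kernel version splits the same work so that __sha1_ssse3_update() is always handed whole 64-byte blocks, which is why the padlen <= 56 case is buffered separately above.
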
149
150static int sha1_ssse3_export(struct shash_desc *desc, void *out)
151{
152 struct sha1_state *sctx = shash_desc_ctx(desc);
153
154 memcpy(out, sctx, sizeof(*sctx));
155
156 return 0;
157}
158
159static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
160{
161 struct sha1_state *sctx = shash_desc_ctx(desc);
162
163 memcpy(sctx, in, sizeof(*sctx));
164
165 return 0;
166}
167
168static struct shash_alg alg = {
169 .digestsize = SHA1_DIGEST_SIZE,
170 .init = sha1_ssse3_init,
171 .update = sha1_ssse3_update,
172 .final = sha1_ssse3_final,
173 .export = sha1_ssse3_export,
174 .import = sha1_ssse3_import,
175 .descsize = sizeof(struct sha1_state),
176 .statesize = sizeof(struct sha1_state),
177 .base = {
178 .cra_name = "sha1",
179 .cra_driver_name= "sha1-ssse3",
180 .cra_priority = 150,
181 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
182 .cra_blocksize = SHA1_BLOCK_SIZE,
183 .cra_module = THIS_MODULE,
184 }
185};
186
187#ifdef CONFIG_AS_AVX
188static bool __init avx_usable(void)
189{
190 u64 xcr0;
191
192 if (!cpu_has_avx || !cpu_has_osxsave)
193 return false;
194
195 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
196 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
197 pr_info("AVX detected but unusable.\n");
198
199 return false;
200 }
201
202 return true;
203}
204#endif
205
206static int __init sha1_ssse3_mod_init(void)
207{
208 /* test for SSSE3 first */
209 if (cpu_has_ssse3)
210 sha1_transform_asm = sha1_transform_ssse3;
211
212#ifdef CONFIG_AS_AVX
213 /* allow AVX to override SSSE3, it's a little faster */
214 if (avx_usable())
215 sha1_transform_asm = sha1_transform_avx;
216#endif
217
218 if (sha1_transform_asm) {
219 pr_info("Using %s optimized SHA-1 implementation\n",
220 sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3"
221 : "AVX");
222 return crypto_register_shash(&alg);
223 }
224 pr_info("Neither AVX nor SSSE3 is available/usable.\n");
225
226 return -ENODEV;
227}
228
229static void __exit sha1_ssse3_mod_fini(void)
230{
231 crypto_unregister_shash(&alg);
232}
233
234module_init(sha1_ssse3_mod_init);
235module_exit(sha1_ssse3_mod_fini);
236
237MODULE_LICENSE("GPL");
238MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
239
240MODULE_ALIAS("sha1");
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
deleted file mode 100644
index ebac16bfa83..00000000000
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ /dev/null
@@ -1,423 +0,0 @@
1/*
2 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
22 * USA
23 *
24 */
25
26#include "glue_helper-asm-avx.S"
27
28.file "twofish-avx-x86_64-asm_64.S"
29
30.data
31.align 16
32
33.Lbswap128_mask:
34 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
35
36.text
37
38/* structure of crypto context */
39#define s0 0
40#define s1 1024
41#define s2 2048
42#define s3 3072
43#define w 4096
44#define k 4128
45
46/**********************************************************************
47 8-way AVX twofish
48 **********************************************************************/
49#define CTX %rdi
50
51#define RA1 %xmm0
52#define RB1 %xmm1
53#define RC1 %xmm2
54#define RD1 %xmm3
55
56#define RA2 %xmm4
57#define RB2 %xmm5
58#define RC2 %xmm6
59#define RD2 %xmm7
60
61#define RX0 %xmm8
62#define RY0 %xmm9
63
64#define RX1 %xmm10
65#define RY1 %xmm11
66
67#define RK1 %xmm12
68#define RK2 %xmm13
69
70#define RT %xmm14
71#define RR %xmm15
72
73#define RID1 %rbp
74#define RID1d %ebp
75#define RID2 %rsi
76#define RID2d %esi
77
78#define RGI1 %rdx
79#define RGI1bl %dl
80#define RGI1bh %dh
81#define RGI2 %rcx
82#define RGI2bl %cl
83#define RGI2bh %ch
84
85#define RGI3 %rax
86#define RGI3bl %al
87#define RGI3bh %ah
88#define RGI4 %rbx
89#define RGI4bl %bl
90#define RGI4bh %bh
91
92#define RGS1 %r8
93#define RGS1d %r8d
94#define RGS2 %r9
95#define RGS2d %r9d
96#define RGS3 %r10
97#define RGS3d %r10d
98
99
100#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
101 movzbl src ## bl, RID1d; \
102 movzbl src ## bh, RID2d; \
103 shrq $16, src; \
104 movl t0(CTX, RID1, 4), dst ## d; \
105 movl t1(CTX, RID2, 4), RID2d; \
106 movzbl src ## bl, RID1d; \
107 xorl RID2d, dst ## d; \
108 movzbl src ## bh, RID2d; \
109 interleave_op(il_reg); \
110 xorl t2(CTX, RID1, 4), dst ## d; \
111 xorl t3(CTX, RID2, 4), dst ## d;
112
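
The lookup_32bit macro above is the table half of Twofish's key-dependent g function: each byte of a 32-bit word indexes one of the four 1 KiB tables at s0..s3 and the four results are XORed together. A scalar C equivalent (illustrative code, not the kernel's):

#include <stdint.h>

/* t0..t3 are the four key-dependent 256-entry tables (s0..s3 above). */
static uint32_t lookup_32bit(const uint32_t t0[256], const uint32_t t1[256],
                             const uint32_t t2[256], const uint32_t t3[256],
                             uint32_t x)
{
        return t0[x & 0xff] ^
               t1[(x >> 8) & 0xff] ^
               t2[(x >> 16) & 0xff] ^
               t3[(x >> 24) & 0xff];
}

round_head_2 below uses this with table order s0, s1, s2, s3 for the first input register and with the rotated order s1, s2, s3, s0 for the second, which folds the byte rotation of Twofish's second g input into the lookups.
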
113#define dummy(d) /* do nothing */
114
115#define shr_next(reg) \
116 shrq $16, reg;
117
118#define G(gi1, gi2, x, t0, t1, t2, t3) \
119 lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
120 lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
121 \
122 lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
123 shlq $32, RGS2; \
124 orq RGS1, RGS2; \
125 lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
126 shlq $32, RGS1; \
127 orq RGS1, RGS3;
128
129#define round_head_2(a, b, x1, y1, x2, y2) \
130 vmovq b ## 1, RGI3; \
131 vpextrq $1, b ## 1, RGI4; \
132 \
133 G(RGI1, RGI2, x1, s0, s1, s2, s3); \
134 vmovq a ## 2, RGI1; \
135 vpextrq $1, a ## 2, RGI2; \
136 vmovq RGS2, x1; \
137 vpinsrq $1, RGS3, x1, x1; \
138 \
139 G(RGI3, RGI4, y1, s1, s2, s3, s0); \
140 vmovq b ## 2, RGI3; \
141 vpextrq $1, b ## 2, RGI4; \
142 vmovq RGS2, y1; \
143 vpinsrq $1, RGS3, y1, y1; \
144 \
145 G(RGI1, RGI2, x2, s0, s1, s2, s3); \
146 vmovq RGS2, x2; \
147 vpinsrq $1, RGS3, x2, x2; \
148 \
149 G(RGI3, RGI4, y2, s1, s2, s3, s0); \
150 vmovq RGS2, y2; \
151 vpinsrq $1, RGS3, y2, y2;
152
153#define encround_tail(a, b, c, d, x, y, prerotate) \
154 vpaddd x, y, x; \
155 vpaddd x, RK1, RT;\
156 prerotate(b); \
157 vpxor RT, c, c; \
158 vpaddd y, x, y; \
159 vpaddd y, RK2, y; \
160 vpsrld $1, c, RT; \
161 vpslld $(32 - 1), c, c; \
162 vpor c, RT, c; \
163 vpxor d, y, d; \
164
165#define decround_tail(a, b, c, d, x, y, prerotate) \
166 vpaddd x, y, x; \
167 vpaddd x, RK1, RT;\
168 prerotate(a); \
169 vpxor RT, c, c; \
170 vpaddd y, x, y; \
171 vpaddd y, RK2, y; \
172 vpxor d, y, d; \
173 vpsrld $1, d, y; \
174 vpslld $(32 - 1), d, d; \
175 vpor d, y, d; \
176
177#define rotate_1l(x) \
178 vpslld $1, x, RR; \
179 vpsrld $(32 - 1), x, x; \
180 vpor x, RR, x;
181
182#define preload_rgi(c) \
183 vmovq c, RGI1; \
184 vpextrq $1, c, RGI2;
185
186#define encrypt_round(n, a, b, c, d, preload, prerotate) \
187 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
188 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
189 round_head_2(a, b, RX0, RY0, RX1, RY1); \
190 encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
191 preload(c ## 1); \
192 encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
193
194#define decrypt_round(n, a, b, c, d, preload, prerotate) \
195 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
196 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
197 round_head_2(a, b, RX0, RY0, RX1, RY1); \
198 decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
199 preload(c ## 1); \
200 decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
201
202#define encrypt_cycle(n) \
203 encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
204 encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
205
206#define encrypt_cycle_last(n) \
207 encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
208 encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);
209
210#define decrypt_cycle(n) \
211 decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
212 decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);
213
214#define decrypt_cycle_last(n) \
215 decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
216 decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
217
218#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
219 vpunpckldq x1, x0, t0; \
220 vpunpckhdq x1, x0, t2; \
221 vpunpckldq x3, x2, t1; \
222 vpunpckhdq x3, x2, x3; \
223 \
224 vpunpcklqdq t1, t0, x0; \
225 vpunpckhqdq t1, t0, x1; \
226 vpunpcklqdq x3, t2, x2; \
227 vpunpckhqdq x3, t2, x3;
228
229#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
230 vpxor x0, wkey, x0; \
231 vpxor x1, wkey, x1; \
232 vpxor x2, wkey, x2; \
233 vpxor x3, wkey, x3; \
234 \
235 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
236
237#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
238 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
239 \
240 vpxor x0, wkey, x0; \
241 vpxor x1, wkey, x1; \
242 vpxor x2, wkey, x2; \
243 vpxor x3, wkey, x3;
244
245.align 8
246.type __twofish_enc_blk8,@function;
247
248__twofish_enc_blk8:
249 /* input:
250 * %rdi: ctx, CTX
251 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
252 * output:
253 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
254 */
255
256 vmovdqu w(CTX), RK1;
257
258 pushq %rbp;
259 pushq %rbx;
260 pushq %rcx;
261
262 inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
263 preload_rgi(RA1);
264 rotate_1l(RD1);
265 inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
266 rotate_1l(RD2);
267
268 encrypt_cycle(0);
269 encrypt_cycle(1);
270 encrypt_cycle(2);
271 encrypt_cycle(3);
272 encrypt_cycle(4);
273 encrypt_cycle(5);
274 encrypt_cycle(6);
275 encrypt_cycle_last(7);
276
277 vmovdqu (w+4*4)(CTX), RK1;
278
279 popq %rcx;
280 popq %rbx;
281 popq %rbp;
282
283 outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
284 outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
285
286 ret;
287
288.align 8
289.type __twofish_dec_blk8,@function;
290
291__twofish_dec_blk8:
292 /* input:
293 * %rdi: ctx, CTX
294 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
295 * output:
296 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
297 */
298
299 vmovdqu (w+4*4)(CTX), RK1;
300
301 pushq %rbp;
302 pushq %rbx;
303
304 inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
305 preload_rgi(RC1);
306 rotate_1l(RA1);
307 inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
308 rotate_1l(RA2);
309
310 decrypt_cycle(7);
311 decrypt_cycle(6);
312 decrypt_cycle(5);
313 decrypt_cycle(4);
314 decrypt_cycle(3);
315 decrypt_cycle(2);
316 decrypt_cycle(1);
317 decrypt_cycle_last(0);
318
319 vmovdqu (w)(CTX), RK1;
320
321 popq %rbx;
322 popq %rbp;
323
324 outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
325 outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
326
327 ret;
328
329.align 8
330.global twofish_ecb_enc_8way
331.type twofish_ecb_enc_8way,@function;
332
333twofish_ecb_enc_8way:
334 /* input:
335 * %rdi: ctx, CTX
336 * %rsi: dst
337 * %rdx: src
338 */
339
340 movq %rsi, %r11;
341
342 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
343
344 call __twofish_enc_blk8;
345
346 store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
347
348 ret;
349
350.align 8
351.global twofish_ecb_dec_8way
352.type twofish_ecb_dec_8way,@function;
353
354twofish_ecb_dec_8way:
355 /* input:
356 * %rdi: ctx, CTX
357 * %rsi: dst
358 * %rdx: src
359 */
360
361 movq %rsi, %r11;
362
363 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
364
365 call __twofish_dec_blk8;
366
367 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
368
369 ret;
370
371.align 8
372.global twofish_cbc_dec_8way
373.type twofish_cbc_dec_8way,@function;
374
375twofish_cbc_dec_8way:
376 /* input:
377 * %rdi: ctx, CTX
378 * %rsi: dst
379 * %rdx: src
380 */
381
382 pushq %r12;
383
384 movq %rsi, %r11;
385 movq %rdx, %r12;
386
387 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
388
389 call __twofish_dec_blk8;
390
391 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
392
393 popq %r12;
394
395 ret;
396
397.align 8
398.global twofish_ctr_8way
399.type twofish_ctr_8way,@function;
400
401twofish_ctr_8way:
402 /* input:
403 * %rdi: ctx, CTX
404 * %rsi: dst
405 * %rdx: src
406 * %rcx: iv (little endian, 128bit)
407 */
408
409 pushq %r12;
410
411 movq %rsi, %r11;
412 movq %rdx, %r12;
413
414 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
415 RD2, RX0, RX1, RY0);
416
417 call __twofish_enc_blk8;
418
419 store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
420
421 popq %r12;
422
423 ret;
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 658af4bb35c..575331cb2a8 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -26,7 +26,7 @@
26 26
27#define in_blk 12 /* input byte array address parameter*/ 27#define in_blk 12 /* input byte array address parameter*/
28#define out_blk 8 /* output byte array address parameter*/ 28#define out_blk 8 /* output byte array address parameter*/
29#define ctx 4 /* Twofish context structure */ 29#define tfm 4 /* Twofish context structure */
30 30
31#define a_offset 0 31#define a_offset 0
32#define b_offset 4 32#define b_offset 4
@@ -229,8 +229,8 @@ twofish_enc_blk:
229 push %esi 229 push %esi
230 push %edi 230 push %edi
231 231
232 mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base 232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base pointer to the crypto tfm */
233 * pointer to the ctx address */ 233 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
234 mov in_blk+16(%esp),%edi /* input address in edi */ 234 mov in_blk+16(%esp),%edi /* input address in edi */
235 235
236 mov (%edi), %eax 236 mov (%edi), %eax
@@ -285,8 +285,8 @@ twofish_dec_blk:
285 push %edi 285 push %edi
286 286
287 287
288 mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base 288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base pointer to the crypto tfm */
289 * pointer to the ctx address */ 289 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
290 mov in_blk+16(%esp),%edi /* input address in edi */ 290 mov in_blk+16(%esp),%edi /* input address in edi */
291 291
292 mov (%edi), %eax 292 mov (%edi), %eax
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
deleted file mode 100644
index 5b012a2c511..00000000000
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ /dev/null
@@ -1,316 +0,0 @@
1/*
2 * Twofish Cipher 3-way parallel algorithm (x86_64)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19 * USA
20 *
21 */
22
23.file "twofish-x86_64-asm-3way.S"
24.text
25
26/* structure of crypto context */
27#define s0 0
28#define s1 1024
29#define s2 2048
30#define s3 3072
31#define w 4096
32#define k 4128
33
34/**********************************************************************
35 3-way twofish
36 **********************************************************************/
37#define CTX %rdi
38#define RIO %rdx
39
40#define RAB0 %rax
41#define RAB1 %rbx
42#define RAB2 %rcx
43
44#define RAB0d %eax
45#define RAB1d %ebx
46#define RAB2d %ecx
47
48#define RAB0bh %ah
49#define RAB1bh %bh
50#define RAB2bh %ch
51
52#define RAB0bl %al
53#define RAB1bl %bl
54#define RAB2bl %cl
55
56#define RCD0 %r8
57#define RCD1 %r9
58#define RCD2 %r10
59
60#define RCD0d %r8d
61#define RCD1d %r9d
62#define RCD2d %r10d
63
64#define RX0 %rbp
65#define RX1 %r11
66#define RX2 %r12
67
68#define RX0d %ebp
69#define RX1d %r11d
70#define RX2d %r12d
71
72#define RY0 %r13
73#define RY1 %r14
74#define RY2 %r15
75
76#define RY0d %r13d
77#define RY1d %r14d
78#define RY2d %r15d
79
80#define RT0 %rdx
81#define RT1 %rsi
82
83#define RT0d %edx
84#define RT1d %esi
85
86#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
87 movzbl ab ## bl, tmp2 ## d; \
88 movzbl ab ## bh, tmp1 ## d; \
89 rorq $(rot), ab; \
90 op1##l T0(CTX, tmp2, 4), dst ## d; \
91 op2##l T1(CTX, tmp1, 4), dst ## d;
92
93/*
94 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 95 * at beginning.
96 */
97#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
98 /* G1,1 && G2,1 */ \
99 do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
100 do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
101 \
102 do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
103 do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
104 \
105 do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
106 do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
107 \
108 /* G1,2 && G2,2 */ \
109 do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
110 do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
111 xchgq cd ## 0, ab ## 0; \
112 \
113 do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
114 do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
115 xchgq cd ## 1, ab ## 1; \
116 \
117 do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
118 do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
119 xchgq cd ## 2, ab ## 2;
120
121#define enc_round_end(ab, x, y, n) \
122 addl y ## d, x ## d; \
123 addl x ## d, y ## d; \
124 addl k+4*(2*(n))(CTX), x ## d; \
125 xorl ab ## d, x ## d; \
126 addl k+4*(2*(n)+1)(CTX), y ## d; \
127 shrq $32, ab; \
128 roll $1, ab ## d; \
129 xorl y ## d, ab ## d; \
130 shlq $32, ab; \
131 rorl $1, x ## d; \
132 orq x, ab;
133
134#define dec_round_end(ba, x, y, n) \
135 addl y ## d, x ## d; \
136 addl x ## d, y ## d; \
137 addl k+4*(2*(n))(CTX), x ## d; \
138 addl k+4*(2*(n)+1)(CTX), y ## d; \
139 xorl ba ## d, y ## d; \
140 shrq $32, ba; \
141 roll $1, ba ## d; \
142 xorl x ## d, ba ## d; \
143 shlq $32, ba; \
144 rorl $1, y ## d; \
145 orq y, ba;
146
147#define encrypt_round3(ab, cd, n) \
148 g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
149 \
150 enc_round_end(ab ## 0, RX0, RY0, n); \
151 enc_round_end(ab ## 1, RX1, RY1, n); \
152 enc_round_end(ab ## 2, RX2, RY2, n);
153
154#define decrypt_round3(ba, dc, n) \
155 g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
156 \
157 dec_round_end(ba ## 0, RX0, RY0, n); \
158 dec_round_end(ba ## 1, RX1, RY1, n); \
159 dec_round_end(ba ## 2, RX2, RY2, n);
160
161#define encrypt_cycle3(ab, cd, n) \
162 encrypt_round3(ab, cd, n*2); \
163 encrypt_round3(ab, cd, (n*2)+1);
164
165#define decrypt_cycle3(ba, dc, n) \
166 decrypt_round3(ba, dc, (n*2)+1); \
167 decrypt_round3(ba, dc, (n*2));
168
169#define inpack3(in, n, xy, m) \
170 movq 4*(n)(in), xy ## 0; \
171 xorq w+4*m(CTX), xy ## 0; \
172 \
173 movq 4*(4+(n))(in), xy ## 1; \
174 xorq w+4*m(CTX), xy ## 1; \
175 \
176 movq 4*(8+(n))(in), xy ## 2; \
177 xorq w+4*m(CTX), xy ## 2;
178
179#define outunpack3(op, out, n, xy, m) \
180 xorq w+4*m(CTX), xy ## 0; \
181 op ## q xy ## 0, 4*(n)(out); \
182 \
183 xorq w+4*m(CTX), xy ## 1; \
184 op ## q xy ## 1, 4*(4+(n))(out); \
185 \
186 xorq w+4*m(CTX), xy ## 2; \
187 op ## q xy ## 2, 4*(8+(n))(out);
188
189#define inpack_enc3() \
190 inpack3(RIO, 0, RAB, 0); \
191 inpack3(RIO, 2, RCD, 2);
192
193#define outunpack_enc3(op) \
194 outunpack3(op, RIO, 2, RAB, 6); \
195 outunpack3(op, RIO, 0, RCD, 4);
196
197#define inpack_dec3() \
198 inpack3(RIO, 0, RAB, 4); \
199 rorq $32, RAB0; \
200 rorq $32, RAB1; \
201 rorq $32, RAB2; \
202 inpack3(RIO, 2, RCD, 6); \
203 rorq $32, RCD0; \
204 rorq $32, RCD1; \
205 rorq $32, RCD2;
206
207#define outunpack_dec3() \
208 rorq $32, RCD0; \
209 rorq $32, RCD1; \
210 rorq $32, RCD2; \
211 outunpack3(mov, RIO, 0, RCD, 0); \
212 rorq $32, RAB0; \
213 rorq $32, RAB1; \
214 rorq $32, RAB2; \
215 outunpack3(mov, RIO, 2, RAB, 2);
216
217.align 8
218.global __twofish_enc_blk_3way
219.type __twofish_enc_blk_3way,@function;
220
221__twofish_enc_blk_3way:
222 /* input:
223 * %rdi: ctx, CTX
224 * %rsi: dst
225 * %rdx: src, RIO
226 * %rcx: bool, if true: xor output
227 */
228 pushq %r15;
229 pushq %r14;
230 pushq %r13;
231 pushq %r12;
232 pushq %rbp;
233 pushq %rbx;
234
235 pushq %rcx; /* bool xor */
236 pushq %rsi; /* dst */
237
238 inpack_enc3();
239
240 encrypt_cycle3(RAB, RCD, 0);
241 encrypt_cycle3(RAB, RCD, 1);
242 encrypt_cycle3(RAB, RCD, 2);
243 encrypt_cycle3(RAB, RCD, 3);
244 encrypt_cycle3(RAB, RCD, 4);
245 encrypt_cycle3(RAB, RCD, 5);
246 encrypt_cycle3(RAB, RCD, 6);
247 encrypt_cycle3(RAB, RCD, 7);
248
249 popq RIO; /* dst */
250 popq %rbp; /* bool xor */
251
252 testb %bpl, %bpl;
253 jnz __enc_xor3;
254
255 outunpack_enc3(mov);
256
257 popq %rbx;
258 popq %rbp;
259 popq %r12;
260 popq %r13;
261 popq %r14;
262 popq %r15;
263 ret;
264
265__enc_xor3:
266 outunpack_enc3(xor);
267
268 popq %rbx;
269 popq %rbp;
270 popq %r12;
271 popq %r13;
272 popq %r14;
273 popq %r15;
274 ret;
275
276.global twofish_dec_blk_3way
277.type twofish_dec_blk_3way,@function;
278
279twofish_dec_blk_3way:
280 /* input:
281 * %rdi: ctx, CTX
282 * %rsi: dst
283 * %rdx: src, RIO
284 */
285 pushq %r15;
286 pushq %r14;
287 pushq %r13;
288 pushq %r12;
289 pushq %rbp;
290 pushq %rbx;
291
292 pushq %rsi; /* dst */
293
294 inpack_dec3();
295
296 decrypt_cycle3(RAB, RCD, 7);
297 decrypt_cycle3(RAB, RCD, 6);
298 decrypt_cycle3(RAB, RCD, 5);
299 decrypt_cycle3(RAB, RCD, 4);
300 decrypt_cycle3(RAB, RCD, 3);
301 decrypt_cycle3(RAB, RCD, 2);
302 decrypt_cycle3(RAB, RCD, 1);
303 decrypt_cycle3(RAB, RCD, 0);
304
305 popq RIO; /* dst */
306
307 outunpack_dec3();
308
309 popq %rbx;
310 popq %rbp;
311 popq %r12;
312 popq %r13;
313 popq %r14;
314 popq %r15;
315 ret;
316
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 7bcf3fcc366..573aa102542 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -221,9 +221,10 @@
221twofish_enc_blk: 221twofish_enc_blk:
222 pushq R1 222 pushq R1
223 223
224 /* %rdi contains the ctx address */ 224 /* %rdi contains the crypto tfm address */
225 /* %rsi contains the output address */ 225 /* %rsi contains the output address */
226 /* %rdx contains the input address */ 226 /* %rdx contains the input address */
227 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
227 /* ctx address is moved to free one non-rex register 228 /* ctx address is moved to free one non-rex register
228 as target for the 8bit high operations */ 229 as target for the 8bit high operations */
229 mov %rdi, %r11 230 mov %rdi, %r11
@@ -273,9 +274,10 @@ twofish_enc_blk:
273twofish_dec_blk: 274twofish_dec_blk:
274 pushq R1 275 pushq R1
275 276
276 /* %rdi contains the ctx address */ 277 /* %rdi contains the crypto tfm address */
277 /* %rsi contains the output address */ 278 /* %rsi contains the output address */
278 /* %rdx contains the input address */ 279 /* %rdx contains the input address */
280 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
279 /* ctx address is moved to free one non-rex register 281 /* ctx address is moved to free one non-rex register
280 as target for the 8bit high operations */ 282 as target for the 8bit high operations */
281 mov %rdi, %r11 283 mov %rdi, %r11
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
deleted file mode 100644
index 94ac91d26e4..00000000000
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ /dev/null
@@ -1,571 +0,0 @@
1/*
2 * Glue Code for AVX assembler version of Twofish Cipher
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24#include <linux/module.h>
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/crypto.h>
28#include <linux/err.h>
29#include <crypto/algapi.h>
30#include <crypto/twofish.h>
31#include <crypto/cryptd.h>
32#include <crypto/b128ops.h>
33#include <crypto/ctr.h>
34#include <crypto/lrw.h>
35#include <crypto/xts.h>
36#include <asm/i387.h>
37#include <asm/xcr.h>
38#include <asm/xsave.h>
39#include <asm/crypto/twofish.h>
40#include <asm/crypto/ablk_helper.h>
41#include <asm/crypto/glue_helper.h>
42#include <crypto/scatterwalk.h>
43#include <linux/workqueue.h>
44#include <linux/spinlock.h>
45
46#define TWOFISH_PARALLEL_BLOCKS 8
47
48/* 8-way parallel cipher functions */
49asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
50 const u8 *src);
51asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
52 const u8 *src);
53
54asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
55 const u8 *src);
56asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
57 const u8 *src, le128 *iv);
58
59static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
60 const u8 *src)
61{
62 __twofish_enc_blk_3way(ctx, dst, src, false);
63}
64
65
66static const struct common_glue_ctx twofish_enc = {
67 .num_funcs = 3,
68 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
69
70 .funcs = { {
71 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
72 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
73 }, {
74 .num_blocks = 3,
75 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
76 }, {
77 .num_blocks = 1,
78 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
79 } }
80};
81
82static const struct common_glue_ctx twofish_ctr = {
83 .num_funcs = 3,
84 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
85
86 .funcs = { {
87 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
88 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
89 }, {
90 .num_blocks = 3,
91 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
92 }, {
93 .num_blocks = 1,
94 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
95 } }
96};
97
98static const struct common_glue_ctx twofish_dec = {
99 .num_funcs = 3,
100 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
101
102 .funcs = { {
103 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
104 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
105 }, {
106 .num_blocks = 3,
107 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
108 }, {
109 .num_blocks = 1,
110 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
111 } }
112};
113
114static const struct common_glue_ctx twofish_dec_cbc = {
115 .num_funcs = 3,
116 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
117
118 .funcs = { {
119 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
120 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
121 }, {
122 .num_blocks = 3,
123 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
124 }, {
125 .num_blocks = 1,
126 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
127 } }
128};
129
130static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
134}
135
136static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
137 struct scatterlist *src, unsigned int nbytes)
138{
139 return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
140}
141
142static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
143 struct scatterlist *src, unsigned int nbytes)
144{
145 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
146 dst, src, nbytes);
147}
148
149static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
150 struct scatterlist *src, unsigned int nbytes)
151{
152 return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
153 nbytes);
154}
155
156static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
157 struct scatterlist *src, unsigned int nbytes)
158{
159 return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
160}
161
162static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
163{
164 return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL,
165 fpu_enabled, nbytes);
166}
167
168static inline void twofish_fpu_end(bool fpu_enabled)
169{
170 glue_fpu_end(fpu_enabled);
171}
172
173struct crypt_priv {
174 struct twofish_ctx *ctx;
175 bool fpu_enabled;
176};
177
178static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
179{
180 const unsigned int bsize = TF_BLOCK_SIZE;
181 struct crypt_priv *ctx = priv;
182 int i;
183
184 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
185
186 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
187 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
188 return;
189 }
190
191 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
192 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
193
194 nbytes %= bsize * 3;
195
196 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
197 twofish_enc_blk(ctx->ctx, srcdst, srcdst);
198}
199
200static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
201{
202 const unsigned int bsize = TF_BLOCK_SIZE;
203 struct crypt_priv *ctx = priv;
204 int i;
205
206 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
207
208 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
209 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
210 return;
211 }
212
213 for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
214 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
215
216 nbytes %= bsize * 3;
217
218 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
219 twofish_dec_blk(ctx->ctx, srcdst, srcdst);
220}
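
Both callbacks above use the same cascade: a full batch of eight blocks goes through the 8-way AVX code, anything smaller is handled three blocks at a time and then block by block. A stand-alone sketch of that dispatch arithmetic, with stub functions (enc_8way, enc_3way, enc_1way are placeholders for the real twofish routines):

#include <stdint.h>
#include <stdio.h>

enum { BLK = 16 };      /* TF_BLOCK_SIZE */

/* Stubs that only record how they were called. */
static void enc_8way(uint8_t *buf) { printf("8-way batch at %p\n", (void *)buf); }
static void enc_3way(uint8_t *buf) { printf("3-way group at %p\n", (void *)buf); }
static void enc_1way(uint8_t *buf) { printf("single block at %p\n", (void *)buf); }

static void encrypt_cascade(uint8_t *buf, unsigned int nbytes)
{
        unsigned int i;

        if (nbytes == 8 * BLK) {        /* exactly one full parallel batch */
                enc_8way(buf);
                return;
        }

        for (i = 0; i < nbytes / (3 * BLK); i++, buf += 3 * BLK)
                enc_3way(buf);
        nbytes %= 3 * BLK;

        for (i = 0; i < nbytes / BLK; i++, buf += BLK)
                enc_1way(buf);
}

int main(void)
{
        uint8_t data[8 * BLK] = { 0 };

        encrypt_cascade(data, sizeof(data));    /* one 8-way call */
        encrypt_cascade(data, 5 * BLK);         /* one 3-way group, two singles */
        return 0;
}
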
221
222static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
223 struct scatterlist *src, unsigned int nbytes)
224{
225 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
226 be128 buf[TWOFISH_PARALLEL_BLOCKS];
227 struct crypt_priv crypt_ctx = {
228 .ctx = &ctx->twofish_ctx,
229 .fpu_enabled = false,
230 };
231 struct lrw_crypt_req req = {
232 .tbuf = buf,
233 .tbuflen = sizeof(buf),
234
235 .table_ctx = &ctx->lrw_table,
236 .crypt_ctx = &crypt_ctx,
237 .crypt_fn = encrypt_callback,
238 };
239 int ret;
240
241 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
242 ret = lrw_crypt(desc, dst, src, nbytes, &req);
243 twofish_fpu_end(crypt_ctx.fpu_enabled);
244
245 return ret;
246}
247
248static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
249 struct scatterlist *src, unsigned int nbytes)
250{
251 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
252 be128 buf[TWOFISH_PARALLEL_BLOCKS];
253 struct crypt_priv crypt_ctx = {
254 .ctx = &ctx->twofish_ctx,
255 .fpu_enabled = false,
256 };
257 struct lrw_crypt_req req = {
258 .tbuf = buf,
259 .tbuflen = sizeof(buf),
260
261 .table_ctx = &ctx->lrw_table,
262 .crypt_ctx = &crypt_ctx,
263 .crypt_fn = decrypt_callback,
264 };
265 int ret;
266
267 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
268 ret = lrw_crypt(desc, dst, src, nbytes, &req);
269 twofish_fpu_end(crypt_ctx.fpu_enabled);
270
271 return ret;
272}
273
274static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
275 struct scatterlist *src, unsigned int nbytes)
276{
277 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
278 be128 buf[TWOFISH_PARALLEL_BLOCKS];
279 struct crypt_priv crypt_ctx = {
280 .ctx = &ctx->crypt_ctx,
281 .fpu_enabled = false,
282 };
283 struct xts_crypt_req req = {
284 .tbuf = buf,
285 .tbuflen = sizeof(buf),
286
287 .tweak_ctx = &ctx->tweak_ctx,
288 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
289 .crypt_ctx = &crypt_ctx,
290 .crypt_fn = encrypt_callback,
291 };
292 int ret;
293
294 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
295 ret = xts_crypt(desc, dst, src, nbytes, &req);
296 twofish_fpu_end(crypt_ctx.fpu_enabled);
297
298 return ret;
299}
300
301static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
302 struct scatterlist *src, unsigned int nbytes)
303{
304 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
305 be128 buf[TWOFISH_PARALLEL_BLOCKS];
306 struct crypt_priv crypt_ctx = {
307 .ctx = &ctx->crypt_ctx,
308 .fpu_enabled = false,
309 };
310 struct xts_crypt_req req = {
311 .tbuf = buf,
312 .tbuflen = sizeof(buf),
313
314 .tweak_ctx = &ctx->tweak_ctx,
315 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
316 .crypt_ctx = &crypt_ctx,
317 .crypt_fn = decrypt_callback,
318 };
319 int ret;
320
321 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
322 ret = xts_crypt(desc, dst, src, nbytes, &req);
323 twofish_fpu_end(crypt_ctx.fpu_enabled);
324
325 return ret;
326}
327
328static struct crypto_alg twofish_algs[10] = { {
329 .cra_name = "__ecb-twofish-avx",
330 .cra_driver_name = "__driver-ecb-twofish-avx",
331 .cra_priority = 0,
332 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
333 .cra_blocksize = TF_BLOCK_SIZE,
334 .cra_ctxsize = sizeof(struct twofish_ctx),
335 .cra_alignmask = 0,
336 .cra_type = &crypto_blkcipher_type,
337 .cra_module = THIS_MODULE,
338 .cra_u = {
339 .blkcipher = {
340 .min_keysize = TF_MIN_KEY_SIZE,
341 .max_keysize = TF_MAX_KEY_SIZE,
342 .setkey = twofish_setkey,
343 .encrypt = ecb_encrypt,
344 .decrypt = ecb_decrypt,
345 },
346 },
347}, {
348 .cra_name = "__cbc-twofish-avx",
349 .cra_driver_name = "__driver-cbc-twofish-avx",
350 .cra_priority = 0,
351 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
352 .cra_blocksize = TF_BLOCK_SIZE,
353 .cra_ctxsize = sizeof(struct twofish_ctx),
354 .cra_alignmask = 0,
355 .cra_type = &crypto_blkcipher_type,
356 .cra_module = THIS_MODULE,
357 .cra_u = {
358 .blkcipher = {
359 .min_keysize = TF_MIN_KEY_SIZE,
360 .max_keysize = TF_MAX_KEY_SIZE,
361 .setkey = twofish_setkey,
362 .encrypt = cbc_encrypt,
363 .decrypt = cbc_decrypt,
364 },
365 },
366}, {
367 .cra_name = "__ctr-twofish-avx",
368 .cra_driver_name = "__driver-ctr-twofish-avx",
369 .cra_priority = 0,
370 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
371 .cra_blocksize = 1,
372 .cra_ctxsize = sizeof(struct twofish_ctx),
373 .cra_alignmask = 0,
374 .cra_type = &crypto_blkcipher_type,
375 .cra_module = THIS_MODULE,
376 .cra_u = {
377 .blkcipher = {
378 .min_keysize = TF_MIN_KEY_SIZE,
379 .max_keysize = TF_MAX_KEY_SIZE,
380 .ivsize = TF_BLOCK_SIZE,
381 .setkey = twofish_setkey,
382 .encrypt = ctr_crypt,
383 .decrypt = ctr_crypt,
384 },
385 },
386}, {
387 .cra_name = "__lrw-twofish-avx",
388 .cra_driver_name = "__driver-lrw-twofish-avx",
389 .cra_priority = 0,
390 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
391 .cra_blocksize = TF_BLOCK_SIZE,
392 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
393 .cra_alignmask = 0,
394 .cra_type = &crypto_blkcipher_type,
395 .cra_module = THIS_MODULE,
396 .cra_exit = lrw_twofish_exit_tfm,
397 .cra_u = {
398 .blkcipher = {
399 .min_keysize = TF_MIN_KEY_SIZE +
400 TF_BLOCK_SIZE,
401 .max_keysize = TF_MAX_KEY_SIZE +
402 TF_BLOCK_SIZE,
403 .ivsize = TF_BLOCK_SIZE,
404 .setkey = lrw_twofish_setkey,
405 .encrypt = lrw_encrypt,
406 .decrypt = lrw_decrypt,
407 },
408 },
409}, {
410 .cra_name = "__xts-twofish-avx",
411 .cra_driver_name = "__driver-xts-twofish-avx",
412 .cra_priority = 0,
413 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
414 .cra_blocksize = TF_BLOCK_SIZE,
415 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
416 .cra_alignmask = 0,
417 .cra_type = &crypto_blkcipher_type,
418 .cra_module = THIS_MODULE,
419 .cra_u = {
420 .blkcipher = {
421 .min_keysize = TF_MIN_KEY_SIZE * 2,
422 .max_keysize = TF_MAX_KEY_SIZE * 2,
423 .ivsize = TF_BLOCK_SIZE,
424 .setkey = xts_twofish_setkey,
425 .encrypt = xts_encrypt,
426 .decrypt = xts_decrypt,
427 },
428 },
429}, {
430 .cra_name = "ecb(twofish)",
431 .cra_driver_name = "ecb-twofish-avx",
432 .cra_priority = 400,
433 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
434 .cra_blocksize = TF_BLOCK_SIZE,
435 .cra_ctxsize = sizeof(struct async_helper_ctx),
436 .cra_alignmask = 0,
437 .cra_type = &crypto_ablkcipher_type,
438 .cra_module = THIS_MODULE,
439 .cra_init = ablk_init,
440 .cra_exit = ablk_exit,
441 .cra_u = {
442 .ablkcipher = {
443 .min_keysize = TF_MIN_KEY_SIZE,
444 .max_keysize = TF_MAX_KEY_SIZE,
445 .setkey = ablk_set_key,
446 .encrypt = ablk_encrypt,
447 .decrypt = ablk_decrypt,
448 },
449 },
450}, {
451 .cra_name = "cbc(twofish)",
452 .cra_driver_name = "cbc-twofish-avx",
453 .cra_priority = 400,
454 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
455 .cra_blocksize = TF_BLOCK_SIZE,
456 .cra_ctxsize = sizeof(struct async_helper_ctx),
457 .cra_alignmask = 0,
458 .cra_type = &crypto_ablkcipher_type,
459 .cra_module = THIS_MODULE,
460 .cra_init = ablk_init,
461 .cra_exit = ablk_exit,
462 .cra_u = {
463 .ablkcipher = {
464 .min_keysize = TF_MIN_KEY_SIZE,
465 .max_keysize = TF_MAX_KEY_SIZE,
466 .ivsize = TF_BLOCK_SIZE,
467 .setkey = ablk_set_key,
468 .encrypt = __ablk_encrypt,
469 .decrypt = ablk_decrypt,
470 },
471 },
472}, {
473 .cra_name = "ctr(twofish)",
474 .cra_driver_name = "ctr-twofish-avx",
475 .cra_priority = 400,
476 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
477 .cra_blocksize = 1,
478 .cra_ctxsize = sizeof(struct async_helper_ctx),
479 .cra_alignmask = 0,
480 .cra_type = &crypto_ablkcipher_type,
481 .cra_module = THIS_MODULE,
482 .cra_init = ablk_init,
483 .cra_exit = ablk_exit,
484 .cra_u = {
485 .ablkcipher = {
486 .min_keysize = TF_MIN_KEY_SIZE,
487 .max_keysize = TF_MAX_KEY_SIZE,
488 .ivsize = TF_BLOCK_SIZE,
489 .setkey = ablk_set_key,
490 .encrypt = ablk_encrypt,
491 .decrypt = ablk_encrypt,
492 .geniv = "chainiv",
493 },
494 },
495}, {
496 .cra_name = "lrw(twofish)",
497 .cra_driver_name = "lrw-twofish-avx",
498 .cra_priority = 400,
499 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
500 .cra_blocksize = TF_BLOCK_SIZE,
501 .cra_ctxsize = sizeof(struct async_helper_ctx),
502 .cra_alignmask = 0,
503 .cra_type = &crypto_ablkcipher_type,
504 .cra_module = THIS_MODULE,
505 .cra_init = ablk_init,
506 .cra_exit = ablk_exit,
507 .cra_u = {
508 .ablkcipher = {
509 .min_keysize = TF_MIN_KEY_SIZE +
510 TF_BLOCK_SIZE,
511 .max_keysize = TF_MAX_KEY_SIZE +
512 TF_BLOCK_SIZE,
513 .ivsize = TF_BLOCK_SIZE,
514 .setkey = ablk_set_key,
515 .encrypt = ablk_encrypt,
516 .decrypt = ablk_decrypt,
517 },
518 },
519}, {
520 .cra_name = "xts(twofish)",
521 .cra_driver_name = "xts-twofish-avx",
522 .cra_priority = 400,
523 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
524 .cra_blocksize = TF_BLOCK_SIZE,
525 .cra_ctxsize = sizeof(struct async_helper_ctx),
526 .cra_alignmask = 0,
527 .cra_type = &crypto_ablkcipher_type,
528 .cra_module = THIS_MODULE,
529 .cra_init = ablk_init,
530 .cra_exit = ablk_exit,
531 .cra_u = {
532 .ablkcipher = {
533 .min_keysize = TF_MIN_KEY_SIZE * 2,
534 .max_keysize = TF_MAX_KEY_SIZE * 2,
535 .ivsize = TF_BLOCK_SIZE,
536 .setkey = ablk_set_key,
537 .encrypt = ablk_encrypt,
538 .decrypt = ablk_decrypt,
539 },
540 },
541} };
542
543static int __init twofish_init(void)
544{
545 u64 xcr0;
546
547 if (!cpu_has_avx || !cpu_has_osxsave) {
548 printk(KERN_INFO "AVX instructions are not detected.\n");
549 return -ENODEV;
550 }
551
552 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
553 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
554 printk(KERN_INFO "AVX detected but unusable.\n");
555 return -ENODEV;
556 }
557
558 return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
559}
560
561static void __exit twofish_exit(void)
562{
563 crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
564}
565
566module_init(twofish_init);
567module_exit(twofish_exit);
568
569MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
570MODULE_LICENSE("GPL");
571MODULE_ALIAS("twofish");
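
The LRW/XTS callbacks in the deleted AVX glue above chunk the data and pick the widest block function available: a full 8-block run goes to the AVX 8-way routine, and leftovers fall back to the 3-way and single-block paths. Below is a minimal stand-alone sketch of that chunking arithmetic only, plain user-space C with printf stand-ins for the real twofish_* entry points; BSIZE, PARALLEL_BLOCKS and crypt_chunk are illustrative names, not kernel symbols.

#include <stdio.h>

#define BSIZE 16            /* TF_BLOCK_SIZE */
#define PARALLEL_BLOCKS 8   /* TWOFISH_PARALLEL_BLOCKS */

/* Walk one chunk the way encrypt_callback() does: widest batch first. */
static void crypt_chunk(unsigned int nbytes)
{
	unsigned int i;

	if (nbytes == BSIZE * PARALLEL_BLOCKS) {
		printf("one 8-way call (%u bytes)\n", nbytes);
		return;
	}

	for (i = 0; i < nbytes / (BSIZE * 3); i++)
		printf("one 3-way call\n");

	nbytes %= BSIZE * 3;

	for (i = 0; i < nbytes / BSIZE; i++)
		printf("one single-block call\n");
}

int main(void)
{
	crypt_chunk(8 * BSIZE);   /* full parallel chunk: one 8-way call    */
	crypt_chunk(5 * BSIZE);   /* 5 blocks: one 3-way + two single-block */
	return 0;
}

Batching this way also lets twofish_fpu_begin() enable the FPU only when a full parallel chunk makes the XSAVE/XRSTOR cost worthwhile.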
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 0a520230350..cefaf8b9aa1 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -44,21 +44,17 @@
44#include <linux/module.h> 44#include <linux/module.h>
45#include <linux/types.h> 45#include <linux/types.h>
46 46
47asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, 47asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
48 const u8 *src); 48asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49EXPORT_SYMBOL_GPL(twofish_enc_blk);
50asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
51 const u8 *src);
52EXPORT_SYMBOL_GPL(twofish_dec_blk);
53 49
54static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) 50static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
55{ 51{
56 twofish_enc_blk(crypto_tfm_ctx(tfm), dst, src); 52 twofish_enc_blk(tfm, dst, src);
57} 53}
58 54
59static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) 55static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
60{ 56{
61 twofish_dec_blk(crypto_tfm_ctx(tfm), dst, src); 57 twofish_dec_blk(tfm, dst, src);
62} 58}
63 59
64static struct crypto_alg alg = { 60static struct crypto_alg alg = {
@@ -68,8 +64,9 @@ static struct crypto_alg alg = {
68 .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 64 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
69 .cra_blocksize = TF_BLOCK_SIZE, 65 .cra_blocksize = TF_BLOCK_SIZE,
70 .cra_ctxsize = sizeof(struct twofish_ctx), 66 .cra_ctxsize = sizeof(struct twofish_ctx),
71 .cra_alignmask = 0, 67 .cra_alignmask = 3,
72 .cra_module = THIS_MODULE, 68 .cra_module = THIS_MODULE,
69 .cra_list = LIST_HEAD_INIT(alg.cra_list),
73 .cra_u = { 70 .cra_u = {
74 .cipher = { 71 .cipher = {
75 .cia_min_keysize = TF_MIN_KEY_SIZE, 72 .cia_min_keysize = TF_MIN_KEY_SIZE,
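
The hunk above reverts the asm prototypes: the removed lines had twofish_enc_blk()/twofish_dec_blk() take the key schedule (struct twofish_ctx *) and exported them so the 3-way and AVX glue could reuse them, while the restored lines pass the crypto_tfm straight through. The following stand-alone sketch illustrates the removed style only; struct toy_tfm, struct toy_ctx and the xor "cipher" are hypothetical stand-ins, and in the kernel the context is fetched with crypto_tfm_ctx().

#include <stdio.h>

struct toy_ctx { unsigned char key; };
struct toy_tfm { struct toy_ctx ctx; };     /* context embedded in the tfm */

static struct toy_ctx *toy_tfm_ctx(struct toy_tfm *tfm)
{
	return &tfm->ctx;                   /* crypto_tfm_ctx() analogue */
}

/* Removed style: the block function takes the key schedule directly,
 * so other glue code (3-way, AVX) can call it with just a context. */
static void enc_blk(struct toy_ctx *ctx, unsigned char *dst,
		    const unsigned char *src)
{
	*dst = *src ^ ctx->key;
}

/* The cipher-API wrapper then only has to unwrap the tfm. */
static void toy_encrypt(struct toy_tfm *tfm, unsigned char *dst,
			const unsigned char *src)
{
	enc_blk(toy_tfm_ctx(tfm), dst, src);
}

int main(void)
{
	struct toy_tfm tfm = { .ctx = { .key = 0x5a } };
	unsigned char in = 0x01, out;

	toy_encrypt(&tfm, &out, &in);
	printf("0x%02x\n", out);
	return 0;
}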
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
deleted file mode 100644
index 13e63b3e1df..00000000000
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ /dev/null
@@ -1,499 +0,0 @@
1/*
2 * Glue Code for 3-way parallel assembler optimized version of Twofish
3 *
4 * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
19 * USA
20 *
21 */
22
23#include <asm/processor.h>
24#include <linux/crypto.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/types.h>
28#include <crypto/algapi.h>
29#include <crypto/twofish.h>
30#include <crypto/b128ops.h>
31#include <asm/crypto/twofish.h>
32#include <asm/crypto/glue_helper.h>
33#include <crypto/lrw.h>
34#include <crypto/xts.h>
35
36EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
37EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
38
39static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
40 const u8 *src)
41{
42 __twofish_enc_blk_3way(ctx, dst, src, false);
43}
44
45static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,
46 const u8 *src)
47{
48 __twofish_enc_blk_3way(ctx, dst, src, true);
49}
50
51void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
52{
53 u128 ivs[2];
54
55 ivs[0] = src[0];
56 ivs[1] = src[1];
57
58 twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
59
60 u128_xor(&dst[1], &dst[1], &ivs[0]);
61 u128_xor(&dst[2], &dst[2], &ivs[1]);
62}
63EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
64
65void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
66{
67 be128 ctrblk;
68
69 if (dst != src)
70 *dst = *src;
71
72 le128_to_be128(&ctrblk, iv);
73 le128_inc(iv);
74
75 twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
76 u128_xor(dst, dst, (u128 *)&ctrblk);
77}
78EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
79
80void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
81 le128 *iv)
82{
83 be128 ctrblks[3];
84
85 if (dst != src) {
86 dst[0] = src[0];
87 dst[1] = src[1];
88 dst[2] = src[2];
89 }
90
91 le128_to_be128(&ctrblks[0], iv);
92 le128_inc(iv);
93 le128_to_be128(&ctrblks[1], iv);
94 le128_inc(iv);
95 le128_to_be128(&ctrblks[2], iv);
96 le128_inc(iv);
97
98 twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
99}
100EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
101
102static const struct common_glue_ctx twofish_enc = {
103 .num_funcs = 2,
104 .fpu_blocks_limit = -1,
105
106 .funcs = { {
107 .num_blocks = 3,
108 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
109 }, {
110 .num_blocks = 1,
111 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
112 } }
113};
114
115static const struct common_glue_ctx twofish_ctr = {
116 .num_funcs = 2,
117 .fpu_blocks_limit = -1,
118
119 .funcs = { {
120 .num_blocks = 3,
121 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) }
122 }, {
123 .num_blocks = 1,
124 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) }
125 } }
126};
127
128static const struct common_glue_ctx twofish_dec = {
129 .num_funcs = 2,
130 .fpu_blocks_limit = -1,
131
132 .funcs = { {
133 .num_blocks = 3,
134 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
135 }, {
136 .num_blocks = 1,
137 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
138 } }
139};
140
141static const struct common_glue_ctx twofish_dec_cbc = {
142 .num_funcs = 2,
143 .fpu_blocks_limit = -1,
144
145 .funcs = { {
146 .num_blocks = 3,
147 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
148 }, {
149 .num_blocks = 1,
150 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
151 } }
152};
153
154static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
155 struct scatterlist *src, unsigned int nbytes)
156{
157 return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
158}
159
160static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
161 struct scatterlist *src, unsigned int nbytes)
162{
163 return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
164}
165
166static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
167 struct scatterlist *src, unsigned int nbytes)
168{
169 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
170 dst, src, nbytes);
171}
172
173static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
174 struct scatterlist *src, unsigned int nbytes)
175{
176 return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
177 nbytes);
178}
179
180static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
181 struct scatterlist *src, unsigned int nbytes)
182{
183 return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
184}
185
186static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
187{
188 const unsigned int bsize = TF_BLOCK_SIZE;
189 struct twofish_ctx *ctx = priv;
190 int i;
191
192 if (nbytes == 3 * bsize) {
193 twofish_enc_blk_3way(ctx, srcdst, srcdst);
194 return;
195 }
196
197 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
198 twofish_enc_blk(ctx, srcdst, srcdst);
199}
200
201static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
202{
203 const unsigned int bsize = TF_BLOCK_SIZE;
204 struct twofish_ctx *ctx = priv;
205 int i;
206
207 if (nbytes == 3 * bsize) {
208 twofish_dec_blk_3way(ctx, srcdst, srcdst);
209 return;
210 }
211
212 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
213 twofish_dec_blk(ctx, srcdst, srcdst);
214}
215
216int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
217 unsigned int keylen)
218{
219 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
220 int err;
221
222 err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE,
223 &tfm->crt_flags);
224 if (err)
225 return err;
226
227 return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
228}
229EXPORT_SYMBOL_GPL(lrw_twofish_setkey);
230
231static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
232 struct scatterlist *src, unsigned int nbytes)
233{
234 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
235 be128 buf[3];
236 struct lrw_crypt_req req = {
237 .tbuf = buf,
238 .tbuflen = sizeof(buf),
239
240 .table_ctx = &ctx->lrw_table,
241 .crypt_ctx = &ctx->twofish_ctx,
242 .crypt_fn = encrypt_callback,
243 };
244
245 return lrw_crypt(desc, dst, src, nbytes, &req);
246}
247
248static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
249 struct scatterlist *src, unsigned int nbytes)
250{
251 struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
252 be128 buf[3];
253 struct lrw_crypt_req req = {
254 .tbuf = buf,
255 .tbuflen = sizeof(buf),
256
257 .table_ctx = &ctx->lrw_table,
258 .crypt_ctx = &ctx->twofish_ctx,
259 .crypt_fn = decrypt_callback,
260 };
261
262 return lrw_crypt(desc, dst, src, nbytes, &req);
263}
264
265void lrw_twofish_exit_tfm(struct crypto_tfm *tfm)
266{
267 struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
268
269 lrw_free_table(&ctx->lrw_table);
270}
271EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm);
272
273int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
274 unsigned int keylen)
275{
276 struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
277 u32 *flags = &tfm->crt_flags;
278 int err;
279
280 /* The key consists of two keys of equal size concatenated;
281 * therefore the total length must be even.
282 */
283 if (keylen % 2) {
284 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
285 return -EINVAL;
286 }
287
288 /* first half of xts-key is for crypt */
289 err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
290 if (err)
291 return err;
292
293 /* second half of xts-key is for tweak */
294 return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
295 flags);
296}
297EXPORT_SYMBOL_GPL(xts_twofish_setkey);
298
299static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
300 struct scatterlist *src, unsigned int nbytes)
301{
302 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
303 be128 buf[3];
304 struct xts_crypt_req req = {
305 .tbuf = buf,
306 .tbuflen = sizeof(buf),
307
308 .tweak_ctx = &ctx->tweak_ctx,
309 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
310 .crypt_ctx = &ctx->crypt_ctx,
311 .crypt_fn = encrypt_callback,
312 };
313
314 return xts_crypt(desc, dst, src, nbytes, &req);
315}
316
317static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
318 struct scatterlist *src, unsigned int nbytes)
319{
320 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
321 be128 buf[3];
322 struct xts_crypt_req req = {
323 .tbuf = buf,
324 .tbuflen = sizeof(buf),
325
326 .tweak_ctx = &ctx->tweak_ctx,
327 .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
328 .crypt_ctx = &ctx->crypt_ctx,
329 .crypt_fn = decrypt_callback,
330 };
331
332 return xts_crypt(desc, dst, src, nbytes, &req);
333}
334
335static struct crypto_alg tf_algs[5] = { {
336 .cra_name = "ecb(twofish)",
337 .cra_driver_name = "ecb-twofish-3way",
338 .cra_priority = 300,
339 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
340 .cra_blocksize = TF_BLOCK_SIZE,
341 .cra_ctxsize = sizeof(struct twofish_ctx),
342 .cra_alignmask = 0,
343 .cra_type = &crypto_blkcipher_type,
344 .cra_module = THIS_MODULE,
345 .cra_u = {
346 .blkcipher = {
347 .min_keysize = TF_MIN_KEY_SIZE,
348 .max_keysize = TF_MAX_KEY_SIZE,
349 .setkey = twofish_setkey,
350 .encrypt = ecb_encrypt,
351 .decrypt = ecb_decrypt,
352 },
353 },
354}, {
355 .cra_name = "cbc(twofish)",
356 .cra_driver_name = "cbc-twofish-3way",
357 .cra_priority = 300,
358 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
359 .cra_blocksize = TF_BLOCK_SIZE,
360 .cra_ctxsize = sizeof(struct twofish_ctx),
361 .cra_alignmask = 0,
362 .cra_type = &crypto_blkcipher_type,
363 .cra_module = THIS_MODULE,
364 .cra_u = {
365 .blkcipher = {
366 .min_keysize = TF_MIN_KEY_SIZE,
367 .max_keysize = TF_MAX_KEY_SIZE,
368 .ivsize = TF_BLOCK_SIZE,
369 .setkey = twofish_setkey,
370 .encrypt = cbc_encrypt,
371 .decrypt = cbc_decrypt,
372 },
373 },
374}, {
375 .cra_name = "ctr(twofish)",
376 .cra_driver_name = "ctr-twofish-3way",
377 .cra_priority = 300,
378 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
379 .cra_blocksize = 1,
380 .cra_ctxsize = sizeof(struct twofish_ctx),
381 .cra_alignmask = 0,
382 .cra_type = &crypto_blkcipher_type,
383 .cra_module = THIS_MODULE,
384 .cra_u = {
385 .blkcipher = {
386 .min_keysize = TF_MIN_KEY_SIZE,
387 .max_keysize = TF_MAX_KEY_SIZE,
388 .ivsize = TF_BLOCK_SIZE,
389 .setkey = twofish_setkey,
390 .encrypt = ctr_crypt,
391 .decrypt = ctr_crypt,
392 },
393 },
394}, {
395 .cra_name = "lrw(twofish)",
396 .cra_driver_name = "lrw-twofish-3way",
397 .cra_priority = 300,
398 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
399 .cra_blocksize = TF_BLOCK_SIZE,
400 .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
401 .cra_alignmask = 0,
402 .cra_type = &crypto_blkcipher_type,
403 .cra_module = THIS_MODULE,
404 .cra_exit = lrw_twofish_exit_tfm,
405 .cra_u = {
406 .blkcipher = {
407 .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
408 .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
409 .ivsize = TF_BLOCK_SIZE,
410 .setkey = lrw_twofish_setkey,
411 .encrypt = lrw_encrypt,
412 .decrypt = lrw_decrypt,
413 },
414 },
415}, {
416 .cra_name = "xts(twofish)",
417 .cra_driver_name = "xts-twofish-3way",
418 .cra_priority = 300,
419 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
420 .cra_blocksize = TF_BLOCK_SIZE,
421 .cra_ctxsize = sizeof(struct twofish_xts_ctx),
422 .cra_alignmask = 0,
423 .cra_type = &crypto_blkcipher_type,
424 .cra_module = THIS_MODULE,
425 .cra_u = {
426 .blkcipher = {
427 .min_keysize = TF_MIN_KEY_SIZE * 2,
428 .max_keysize = TF_MAX_KEY_SIZE * 2,
429 .ivsize = TF_BLOCK_SIZE,
430 .setkey = xts_twofish_setkey,
431 .encrypt = xts_encrypt,
432 .decrypt = xts_decrypt,
433 },
434 },
435} };
436
437static bool is_blacklisted_cpu(void)
438{
439 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
440 return false;
441
442 if (boot_cpu_data.x86 == 0x06 &&
443 (boot_cpu_data.x86_model == 0x1c ||
444 boot_cpu_data.x86_model == 0x26 ||
445 boot_cpu_data.x86_model == 0x36)) {
446 /*
447 * On Atom, twofish-3way is slower than the original assembler
448 * implementation. Twofish-3way trades some per-block performance
449 * by storing blocks in 64-bit registers so that three blocks can
450 * be processed in parallel. On out-of-order CPUs that parallelism
451 * gains back more performance than was traded away; Atom, however,
452 * does not benefit from the parallelism and should be
453 * blacklisted.
454 */
455 return true;
456 }
457
458 if (boot_cpu_data.x86 == 0x0f) {
459 /*
460 * On Pentium 4, twofish-3way is slower than the original assembler
461 * implementation because of the excessive use of 64-bit rotates and
462 * left-shifts (which are really slow on P4) needed to store and
463 * handle a 128-bit block in two 64-bit registers.
464 */
465 return true;
466 }
467
468 return false;
469}
470
471static int force;
472module_param(force, int, 0);
473MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
474
475static int __init init(void)
476{
477 if (!force && is_blacklisted_cpu()) {
478 printk(KERN_INFO
479 "twofish-x86_64-3way: performance on this CPU "
480 "would be suboptimal: disabling "
481 "twofish-x86_64-3way.\n");
482 return -ENODEV;
483 }
484
485 return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
486}
487
488static void __exit fini(void)
489{
490 crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
491}
492
493module_init(init);
494module_exit(fini);
495
496MODULE_LICENSE("GPL");
497MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");
498MODULE_ALIAS("twofish");
499MODULE_ALIAS("twofish-asm");
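
For reference, twofish_enc_blk_ctr() in the deleted file above generates CTR keystream by converting the little-endian counter to a big-endian block, encrypting that block, XORing it into the data and then incrementing the counter. The stand-alone sketch below mirrors that pattern only; the byte reversal, the 64-bit counter and the xor "cipher" are simplified stand-ins for le128_to_be128()/le128_inc() and the twofish asm routine.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLK 16

/* toy single-block "cipher": just XORs a fixed pattern in */
static void toy_enc_blk(uint8_t *dst, const uint8_t *src)
{
	for (int i = 0; i < BLK; i++)
		dst[i] = src[i] ^ 0xa5;
}

static void ctr_crypt_block(uint8_t *dst, const uint8_t *src, uint64_t *ctr)
{
	uint8_t ctrblk[BLK] = { 0 };
	uint8_t keystream[BLK];

	/* store the counter big-endian in the low half of the block */
	for (int i = 0; i < 8; i++)
		ctrblk[BLK - 1 - i] = (uint8_t)(*ctr >> (8 * i));
	(*ctr)++;                        /* le128_inc() analogue */

	toy_enc_blk(keystream, ctrblk);  /* encrypt the counter block */
	for (int i = 0; i < BLK; i++)
		dst[i] = src[i] ^ keystream[i];
}

int main(void)
{
	uint8_t pt[BLK] = "ctr mode sketch", ct[BLK], back[BLK];
	uint64_t ctr;

	ctr = 1; ctr_crypt_block(ct, pt, &ctr);
	ctr = 1; ctr_crypt_block(back, ct, &ctr);   /* decrypt = same op */

	printf("%s\n", memcmp(pt, back, BLK) ? "mismatch" : "round-trip ok");
	return 0;
}

Because the keystream depends only on the counter, decryption reuses the same operation, which is why the blkcipher entries register ctr_crypt for both encrypt and decrypt.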