author		Ard Biesheuvel <ard.biesheuvel@linaro.org>	2016-12-05 13:42:26 -0500
committer	Herbert Xu <herbert@gondor.apana.org.au>	2016-12-07 07:01:21 -0500
commit		1d481f1cd8925bd92387983ea1245a0ea0f16d32 (patch)
tree		255fbaada604e7a2cce1dc0a33cde04b2ae88472
parent		6ef5737f39314907704d68719b74fcca11f4f342 (diff)
crypto: arm/crct10dif - port x86 SSE implementation to ARM
This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on buffers that are 16 byte aligned (but of any size).
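
As a cross-check, the CRC this code must compute can be modeled
bit-serially in a few lines of C (a reference sketch, not part of this
patch; the kernel's generic fallback is crc_t10dif_generic()):

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Reference model of CRC-T10DIF: polynomial 0x8bb7, MSB-first,
     * no input/output reflection, zero initial value.
     */
    static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf,
                                   size_t len)
    {
            while (len--) {
                    int i;

                    crc ^= (uint16_t)*buf++ << 8;  /* next byte into the top */
                    for (i = 0; i < 8; i++)        /* shift/reduce per bit */
                            crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
                                                 : crc << 1;
            }
            return crc;
    }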
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 arch/arm/crypto/Kconfig             |   5 +
 arch/arm/crypto/Makefile            |   2 +
 arch/arm/crypto/crct10dif-ce-core.S | 427 ++++++++++++++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c | 101 ++++++++
 4 files changed, 535 insertions(+), 0 deletions(-)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index dd90e389708e..491a6edfeff6 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
 	  that is part of the ARMv8 Crypto Extensions
 
+config CRYPTO_CRCT10DIF_ARM_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 endif
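
For reference, a hypothetical .config fragment that satisfies the
dependencies above and builds the new driver as a module:

    CONFIG_KERNEL_MODE_NEON=y
    CONFIG_CRC_T10DIF=y
    CONFIG_CRYPTO_CRCT10DIF_ARM_CE=m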
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
 quiet_cmd_perl = PERL $@
 cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..ce45ba0c0687
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -0,0 +1,427 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Function API (as declared in the glue code for this ARM port):
+// u16 crc_t10dif_pmull(
+//		u16 init_crc,		// initial CRC value, 16 bits
+//		const u8 *buf,		// buffer pointer to calculate CRC on
+//		u32 len			// buffer length in bytes
+// );
+//
+// Reference paper titled "Fast CRC Computation for Generic
+// Polynomials Using PCLMULQDQ Instruction"
+// URL: http://www.intel.com/content/dam/www/public/us/en/documents
+// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...)	code
+#endif
+
+	.text
+	.fpu		crypto-neon-fp-armv8
+
+	arg1_low32	.req	r0
+	arg2		.req	r1
+	arg3		.req	r2
+
+	qzr		.req	q13
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
+
+ENTRY(crc_t10dif_pmull)
+	vmov.i8		qzr, #0			// init zero register
+
+	// adjust the 16-bit initial_crc value, scale it to 32 bits
+	lsl		arg1_low32, arg1_low32, #16
+
+	// check if smaller than 256
+	cmp		arg3, #256
+
+	// for sizes less than 128, we can't fold 64B at a time...
+	blt		_less_than_128
+
+	// load the initial crc value
+	// crc value does not need to be byte-reflected, but it needs
+	// to be moved to the high part of the register.
+	// because data will be byte-reflected and will align with
+	// initial crc at correct place.
+	vmov		s0, arg1_low32		// initial crc
+	vext.8		q10, qzr, q0, #4
+
+	// receive the initial 64B data, xor the initial crc value
+	vld1.64		{q0-q1}, [arg2, :128]!
+	vld1.64		{q2-q3}, [arg2, :128]!
+	vld1.64		{q4-q5}, [arg2, :128]!
+	vld1.64		{q6-q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0		)
+CPU_LE(	vrev64.8	q1, q1		)
+CPU_LE(	vrev64.8	q2, q2		)
+CPU_LE(	vrev64.8	q3, q3		)
+CPU_LE(	vrev64.8	q4, q4		)
+CPU_LE(	vrev64.8	q5, q5		)
+CPU_LE(	vrev64.8	q6, q6		)
+CPU_LE(	vrev64.8	q7, q7		)
+
+	vswp		d0, d1
+	vswp		d2, d3
+	vswp		d4, d5
+	vswp		d6, d7
+	vswp		d8, d9
+	vswp		d10, d11
+	vswp		d12, d13
+	vswp		d14, d15
+
+	// XOR the initial_crc value
+	veor.8		q0, q0, q10
+
+	adr		ip, rk3
+	vld1.64		{q10}, [ip, :128]	// xmm10 has rk3 and rk4
+
+	//
+	// we subtract 256 instead of 128 to save one instruction from the loop
+	//
+	sub		arg3, arg3, #256
+
+	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	// buffer. The _fold_64_B_loop will fold 64B at a time
+	// until we have 64+y Bytes of buffer
+
+
+	// fold 64B at a time. This section of the code folds 4 vector
+	// registers in parallel
+_fold_64_B_loop:
+
+	.macro		fold64, reg1, reg2
+	vld1.64		{q11-q12}, [arg2, :128]!
+
+	vmull.p64	q8, \reg1\()h, d21
+	vmull.p64	\reg1, \reg1\()l, d20
+	vmull.p64	q9, \reg2\()h, d21
+	vmull.p64	\reg2, \reg2\()l, d20
+
+CPU_LE(	vrev64.8	q11, q11	)
+CPU_LE(	vrev64.8	q12, q12	)
+	vswp		d22, d23
+	vswp		d24, d25
+
+	veor.8		\reg1, \reg1, q8
+	veor.8		\reg2, \reg2, q9
+	veor.8		\reg1, \reg1, q11
+	veor.8		\reg2, \reg2, q12
+	.endm
+
+	fold64		q0, q1
+	fold64		q2, q3
+	fold64		q4, q5
+	fold64		q6, q7
+
+	subs		arg3, arg3, #128
+
+	// check if there is another 64B in the buffer to be able to fold
+	bge		_fold_64_B_loop
+
+	// at this point, the buffer pointer is pointing at the last y Bytes
+	// of the buffer, and the 64B of folded data is in 4 of the vector
+	// registers: v0, v1, v2, v3
+
+	// fold the 8 vector registers to 1 vector register with different
+	// constants
+
+	adr		ip, rk9
+	vld1.64		{q10}, [ip, :128]!
+
+	.macro		fold16, reg, rk
+	vmull.p64	q8, \reg\()l, d20
+	vmull.p64	\reg, \reg\()h, d21
+	.ifnb		\rk
+	vld1.64		{q10}, [ip, :128]!
+	.endif
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, \reg
+	.endm
+
+	fold16		q0, rk11
+	fold16		q1, rk13
+	fold16		q2, rk15
+	fold16		q3, rk17
+	fold16		q4, rk19
+	fold16		q5, rk1
+	fold16		q6
+
+	// instead of 64, we add 48 to the loop counter to save 1 instruction
+	// from the loop. instead of a cmp instruction, we use the negative
+	// flag with the jl instruction
+	adds		arg3, arg3, #(128-16)
+	blt		_final_reduction_for_128
+
+	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
+	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
+	// continue folding 16B at a time
+
+_16B_reduction_loop:
+	vmull.p64	q8, d14, d20
+	vmull.p64	q7, d15, d21
+	veor.8		q7, q7, q8
+
+	vld1.64		{q0}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0		)
+	vswp		d0, d1
+	veor.8		q7, q7, q0
+	subs		arg3, arg3, #16
+
+	// instead of a cmp instruction, we utilize the flags with the
+	// jge instruction equivalent of: cmp arg3, 16-16
+	// check if there is any more 16B in the buffer to be able to fold
+	bge		_16B_reduction_loop
+
+	// now we have 16+z bytes left to reduce, where 0<= z < 16.
+	// first, we reduce the data in the xmm7 register
+
+_final_reduction_for_128:
+	// check if any more data to fold. If not, compute the CRC of
+	// the final 128 bits
+	adds		arg3, arg3, #16
+	beq		_128_done
+
+	// here we are getting data that is less than 16 bytes.
+	// since we know that there was data before the pointer, we can
+	// offset the input pointer before the actual point, to receive
+	// exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_regs:
+	add		arg2, arg2, arg3
+	sub		arg2, arg2, #16
+	vld1.64		{q1}, [arg2]
+CPU_LE(	vrev64.8	q1, q1		)
+	vswp		d2, d3
+
+	// get rid of the extra data that was loaded before
+	// load the shift constant
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+
+	// shift v2 to the left by arg3 bytes
+	vtbl.8		d4, {d14-d15}, d0
+	vtbl.8		d5, {d14-d15}, d1
+
+	// shift v7 to the right by 16-arg3 bytes
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d19, {d14-d15}, d1
+
+	// blend
+	vshr.s8		q0, q0, #7		// convert to 8-bit mask
+	vbsl.8		q0, q2, q1
+
+	// fold 16 Bytes
+	vmull.p64	q8, d18, d20
+	vmull.p64	q7, d19, d21
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, q0
+
+_128_done:
+	// compute crc of a 128-bit value
+	vldr		d20, rk5
+	vldr		d21, rk6		// rk5 and rk6 in xmm10
+
+	// 64b fold
+	vext.8		q0, qzr, q7, #8
+	vmull.p64	q7, d15, d20
+	veor.8		q7, q7, q0
+
+	// 32b fold
+	vext.8		q0, q7, qzr, #12
+	vmov		s31, s3
+	vmull.p64	q0, d0, d21
+	veor.8		q7, q0, q7
+
+	// barrett reduction
+_barrett:
+	vldr		d20, rk7
+	vldr		d21, rk8
+
+	vmull.p64	q0, d15, d20
+	vext.8		q0, qzr, q0, #12
+	vmull.p64	q0, d1, d21
+	vext.8		q0, qzr, q0, #12
+	veor.8		q7, q7, q0
+	vmov		r0, s29
+
+_cleanup:
+	// scale the result back to 16 bits
+	lsr		r0, r0, #16
+	bx		lr
+
+_less_than_128:
+	teq		arg3, #0
+	beq		_cleanup
+
+	vmov.i8		q0, #0
+	vmov		s3, arg1_low32		// get the initial crc value
+
+	vld1.64		{q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q7, q7		)
+	vswp		d14, d15
+	veor.8		q7, q7, q0
+
+	cmp		arg3, #16
+	beq		_128_done		// exactly 16 left
+	blt		_less_than_16_left
+
+	// now if there is, load the constants
+	vldr		d20, rk1
+	vldr		d21, rk2		// rk1 and rk2 in xmm10
+
+	// check if there is enough buffer to be able to fold 16B at a time
+	subs		arg3, arg3, #32
+	addlt		arg3, arg3, #16
+	blt		_get_last_two_regs
+	b		_16B_reduction_loop
+
+_less_than_16_left:
+	// shl r9, 4
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d15, {d14-d15}, d1
+	vmov		d14, d18
+	b		_128_done
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.align		4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk3:	.quad		0x9d9d000000000000
+rk4:	.quad		0x7cf5000000000000
+rk5:	.quad		0x2d56000000000000
+rk6:	.quad		0x1368000000000000
+rk7:	.quad		0x00000001f65a57f8
+rk8:	.quad		0x000000018bb70000
+rk9:	.quad		0xceae000000000000
+rk10:	.quad		0xbfd6000000000000
+rk11:	.quad		0x1e16000000000000
+rk12:	.quad		0x713c000000000000
+rk13:	.quad		0xf7f9000000000000
+rk14:	.quad		0x80a6000000000000
+rk15:	.quad		0x044c000000000000
+rk16:	.quad		0xe658000000000000
+rk17:	.quad		0xad18000000000000
+rk18:	.quad		0xa497000000000000
+rk19:	.quad		0x6ee3000000000000
+rk20:	.quad		0xe7b5000000000000
+rk1:	.quad		0x2d56000000000000
+rk2:	.quad		0x06df000000000000
+
+tbl_shf_table:
+// use these values for shift constants for the tbl/tbx instruction
+// different alignments result in values as shown:
+//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
+//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
+//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
+//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
+//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
+//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
+//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
+//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
+//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
+//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
+//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
+
+	.byte		0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
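
The rk values above are the folding constants x^(32*n) mod Q, stored in
the upper half of each 64-bit word. A small host-side sketch (a
hypothetical helper, not part of this patch) that should reproduce
entries such as rk5 = 2^(32*3) mod Q << 32:

    #include <stdint.h>
    #include <stdio.h>

    /* compute x^n mod P(x) over GF(2); P has its top bit at position deg */
    static uint64_t gf2_pow_mod(unsigned int n, uint64_t poly, int deg)
    {
            uint64_t r = 1;                         /* the polynomial x^0 */

            while (n--) {
                    r <<= 1;                        /* multiply by x */
                    if (r & (1ULL << deg))          /* degree reached: reduce */
                            r ^= poly;
            }
            return r;
    }

    int main(void)
    {
            /* Q = 0x18bb70000 has degree 32; expect 0x2d56000000000000 */
            printf("rk5 = 0x%016llx\n", (unsigned long long)
                   (gf2_pow_mod(32 * 3, 0x18bb70000ULL, 32) << 32));
            return 0;
    }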
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..d428355cf38d
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,101 @@
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u32 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if (!may_use_simd()) {
+		*crc = crc_t10dif_generic(*crc, data, length);
+	} else {
+		if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
+			l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
+				  ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+
+			*crc = crc_t10dif_generic(*crc, data, l);
+
+			length -= l;
+			data += l;
+		}
+		if (length > 0) {
+			kernel_neon_begin();
+			*crc = crc_t10dif_pmull(*crc, data, length);
+			kernel_neon_end();
+		}
+	}
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crc;
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	if (!(elf_hwcap2 & HWCAP2_PMULL))
+		return -ENODEV;
+
+	return crypto_register_shash(&crc_t10dif_alg);
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+	crypto_unregister_shash(&crc_t10dif_alg);
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
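
For completeness: callers do not invoke crc_t10dif_pmull() directly but
reach this driver through the shash API (crc_t10dif() in lib/crc-t10dif.c
does exactly that behind the scenes). A minimal sketch of kernel code
exercising the registered algorithm, with error handling trimmed and
example_crct10dif() being a hypothetical name:

    #include <crypto/hash.h>
    #include <linux/err.h>

    /* compute CRC-T10DIF over a buffer via the "crct10dif" shash */
    static u16 example_crct10dif(const u8 *data, unsigned int len)
    {
            struct crypto_shash *tfm;
            u16 out = 0;

            tfm = crypto_alloc_shash("crct10dif", 0, 0);
            if (IS_ERR(tfm))
                    return 0;

            {
                    SHASH_DESC_ON_STACK(desc, tfm);

                    desc->tfm = tfm;
                    desc->flags = 0;   /* no CRYPTO_TFM_REQ_MAY_SLEEP */
                    crypto_shash_digest(desc, data, len, (u8 *)&out);
            }
            crypto_free_shash(tfm);
            return out;
    }

The allocation resolves to whichever implementation of "crct10dif" has
the highest cra_priority; at 200, this driver outranks the generic C
implementation when the hardware supports PMULL.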