author     Ard Biesheuvel <ard.biesheuvel@linaro.org>   2014-06-16 06:02:16 -0400
committer  Catalin Marinas <catalin.marinas@arm.com>    2014-06-18 07:40:54 -0400
commit     b913a6404ce2b7d10a735834218d3c1e1bceff2a (patch)
tree       5d2736f6de8a7600c99e471dfddc6d9a5ed43b2d /arch/arm64/crypto/ghash-ce-core.S
parent     6aa8b209f5ef3610d470c519ddd6e6b47e9f6248 (diff)
arm64/crypto: improve performance of GHASH algorithm
This patch modifies the GHASH secure hash implementation to switch to a
faster, polynomial-multiplication-based reduction instead of one that uses
shifts and rotates.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
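For readers who want the idea behind that claim in executable form, here is a
minimal Python model (ours, not part of the patch or the kernel; the names
clmul, reduce_shift and reduce_clmul are invented, and the plain polynomial
representation is a simplification, since real GHASH works on bit-reflected
values). It contrasts a naive shift/XOR reduction with the carry-less-
multiplication-based folding the patch adopts:

import random

P = (1 << 128) | 0x87   # x^128 + x^7 + x^2 + x + 1, the GHASH field polynomial

def clmul(a, b):
    """Carry-less multiply (what a PMULL instruction computes), bit by bit."""
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def reduce_shift(x):
    """Naive shift/XOR reduction of a 256-bit product modulo P."""
    for i in range(255, 127, -1):
        if (x >> i) & 1:
            x ^= P << (i - 128)
    return x

def reduce_clmul(x):
    """Multiply-based reduction: x^128 == x^7 + x^2 + x + 1 == 0x87 (mod P),
    so fold the high half down with two carry-less multiplies."""
    lo, hi = x & ((1 << 128) - 1), x >> 128
    t = clmul(hi, 0x87)            # first fold: at most 7 bits spill past 128
    return lo ^ (t & ((1 << 128) - 1)) ^ clmul(t >> 128, 0x87)  # absorb spill

for _ in range(100):
    a, b = random.getrandbits(128), random.getrandbits(128)
    assert reduce_shift(clmul(a, b)) == reduce_clmul(clmul(a, b))

The kernel code works in GCM's bit-reflected representation, which is why the
MASK constant below is built from 0xe1 (the bit-reversal of 0x87) rather than
0x87 itself, but the two pmull ... MASK.1d instructions in the new code
correspond to the two folds in reduce_clmul.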
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r--   arch/arm64/crypto/ghash-ce-core.S   92
1 file changed, 38 insertions(+), 54 deletions(-)
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index b9e6eaf41c9b..dc457015884e 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -3,14 +3,6 @@
  *
  * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
- * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
- *
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- *           Vinodh Gopal
- *           Erdinc Ozturk
- *           Deniz Karakoyunlu
- *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
  * by the Free Software Foundation.
@@ -19,13 +11,15 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-	DATA	.req	v0
-	SHASH	.req	v1
-	IN1	.req	v2
+	SHASH	.req	v0
+	SHASH2	.req	v1
 	T1	.req	v2
 	T2	.req	v3
-	T3	.req	v4
-	VZR	.req	v5
+	MASK	.req	v4
+	XL	.req	v5
+	XM	.req	v6
+	XH	.req	v7
+	IN1	.req	v7
 
 	.text
 	.arch		armv8-a+crypto
@@ -35,61 +29,51 @@
  *			   struct ghash_key const *k, const char *head)
  */
 ENTRY(pmull_ghash_update)
-	ld1		{DATA.16b}, [x1]
 	ld1		{SHASH.16b}, [x3]
-	eor		VZR.16b, VZR.16b, VZR.16b
+	ld1		{XL.16b}, [x1]
+	movi		MASK.16b, #0xe1
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	shl		MASK.2d, MASK.2d, #57
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	/* do the head block first, if supplied */
 	cbz		x4, 0f
-	ld1		{IN1.2d}, [x4]
+	ld1		{T1.2d}, [x4]
 	b		1f
 
-0:	ld1		{IN1.2d}, [x2], #16
+0:	ld1		{T1.2d}, [x2], #16
 	sub		w0, w0, #1
-1:	ext		IN1.16b, IN1.16b, IN1.16b, #8
-CPU_LE(	rev64		IN1.16b, IN1.16b	)
-	eor		DATA.16b, DATA.16b, IN1.16b
 
-	/* multiply DATA by SHASH in GF(2^128) */
-	ext		T2.16b, DATA.16b, DATA.16b, #8
-	ext		T3.16b, SHASH.16b, SHASH.16b, #8
-	eor		T2.16b, T2.16b, DATA.16b
-	eor		T3.16b, T3.16b, SHASH.16b
+1:	/* multiply XL by SHASH in GF(2^128) */
+CPU_LE(	rev64		T1.16b, T1.16b	)
 
-	pmull2		T1.1q, SHASH.2d, DATA.2d	// a1 * b1
-	pmull		DATA.1q, SHASH.1d, DATA.1d	// a0 * b0
-	pmull		T2.1q, T2.1d, T3.1d		// (a1 + a0)(b1 + b0)
-	eor		T2.16b, T2.16b, T1.16b		// (a0 * b1) + (a1 * b0)
-	eor		T2.16b, T2.16b, DATA.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	ext		IN1.16b, T1.16b, T1.16b, #8
+	eor		T1.16b, T1.16b, T2.16b
+	eor		XL.16b, XL.16b, IN1.16b
 
-	ext		T3.16b, VZR.16b, T2.16b, #8
-	ext		T2.16b, T2.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T3.16b
-	eor		T1.16b, T1.16b, T2.16b		// <T1:DATA> is result of
-						// carry-less multiplication
+	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
+	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)
 
-	/* first phase of the reduction */
-	shl		T3.2d, DATA.2d, #1
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #5
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #57
-	ext		T2.16b, VZR.16b, T3.16b, #8
-	ext		T3.16b, T3.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T2.16b
-	eor		T1.16b, T1.16b, T3.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, XL.16b, XH.16b
+	eor		XM.16b, XM.16b, T1.16b
+	eor		XM.16b, XM.16b, T2.16b
+	pmull		T2.1q, XL.1d, MASK.1d
 
-	/* second phase of the reduction */
-	ushr		T2.2d, DATA.2d, #5
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T1.16b, T1.16b, T2.16b
-	eor		DATA.16b, DATA.16b, T1.16b
+	mov		XH.d[0], XM.d[1]
+	mov		XM.d[1], XL.d[0]
+
+	eor		XL.16b, XM.16b, T2.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	pmull		XL.1q, XL.1d, MASK.1d
+	eor		T2.16b, T2.16b, XH.16b
+	eor		XL.16b, XL.16b, T2.16b
 
 	cbnz		w0, 0b
 
-	st1		{DATA.16b}, [x1]
+	st1		{XL.16b}, [x1]
 	ret
 ENDPROC(pmull_ghash_update)
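A closing note on the multiplication itself: the three pmull/pmull2
instructions implement the three-multiply Karatsuba split named in the code's
own comments (a1 * b1, a0 * b0, (a1 + a0)(b1 + b0)), and the new SHASH2
register exists so that b1 + b0 is computed once per call rather than once per
16-byte block. A sketch of that structure in Python (our illustration, with
invented names clmul64 and karatsuba_clmul; not the kernel's code):

import random

M64 = (1 << 64) - 1

def clmul64(a, b):
    """64 x 64 -> 128-bit carry-less multiply: the work of one PMULL/PMULL2."""
    r = 0
    for i in range(64):
        if (b >> i) & 1:
            r ^= a << i
    return r

def karatsuba_clmul(a, b, b_folded):
    """128 x 128 -> 256-bit carry-less multiply using three 64-bit multiplies.
    b_folded = (b & M64) ^ (b >> 64) is precomputed, as SHASH2 is above."""
    a0, a1 = a & M64, a >> 64
    b0, b1 = b & M64, b >> 64
    lo  = clmul64(a0, b0)             # pmull  XL.1q, SHASH.1d, XL.1d   (a0 * b0)
    hi  = clmul64(a1, b1)             # pmull2 XH.1q, SHASH.2d, XL.2d   (a1 * b1)
    mid = clmul64(a0 ^ a1, b_folded)  # pmull  XM.1q, SHASH2.1d, T1.1d
    mid ^= lo ^ hi                    # cancel a0*b0 and a1*b1, keep cross terms
    return (hi << 128) ^ (mid << 64) ^ lo   # 256-bit product, before reduction

# Self-check against a schoolbook 128-bit carry-less multiply.
a, b = random.getrandbits(128), random.getrandbits(128)
ref = 0
for i in range(128):
    if (b >> i) & 1:
        ref ^= a << i
assert karatsuba_clmul(a, b, (b & M64) ^ (b >> 64)) == ref

The ext/eor pairs that follow the three pmulls in the assembly are that same
mid-term cleanup and 64-bit-offset merge, producing the <XH:XL> value that
then enters the MASK-based reduction.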