aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto/ghash-ce-core.S
diff options
context:
space:
mode:
authorArd Biesheuvel <ard.biesheuvel@linaro.org>2014-06-16 06:02:16 -0400
committerCatalin Marinas <catalin.marinas@arm.com>2014-06-18 07:40:54 -0400
commitb913a6404ce2b7d10a735834218d3c1e1bceff2a (patch)
tree5d2736f6de8a7600c99e471dfddc6d9a5ed43b2d /arch/arm64/crypto/ghash-ce-core.S
parent6aa8b209f5ef3610d470c519ddd6e6b47e9f6248 (diff)
arm64/crypto: improve performance of GHASH algorithm
This patch modifies the GHASH secure hash implementation to switch to a faster, polynomial multiplication based reduction instead of one that uses shifts and rotates. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r--arch/arm64/crypto/ghash-ce-core.S92
1 files changed, 38 insertions, 54 deletions
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index b9e6eaf41c9b..dc457015884e 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -3,14 +3,6 @@
3 * 3 *
4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> 4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 * 5 *
6 * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published 7 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation. 8 * by the Free Software Foundation.
@@ -19,13 +11,15 @@
19#include <linux/linkage.h> 11#include <linux/linkage.h>
20#include <asm/assembler.h> 12#include <asm/assembler.h>
21 13
22 DATA .req v0 14 SHASH .req v0
23 SHASH .req v1 15 SHASH2 .req v1
24 IN1 .req v2
25 T1 .req v2 16 T1 .req v2
26 T2 .req v3 17 T2 .req v3
27 T3 .req v4 18 MASK .req v4
28 VZR .req v5 19 XL .req v5
20 XM .req v6
21 XH .req v7
22 IN1 .req v7
29 23
30 .text 24 .text
31 .arch armv8-a+crypto 25 .arch armv8-a+crypto
@@ -35,61 +29,51 @@
35 * struct ghash_key const *k, const char *head) 29 * struct ghash_key const *k, const char *head)
36 */ 30 */
37ENTRY(pmull_ghash_update) 31ENTRY(pmull_ghash_update)
38 ld1 {DATA.16b}, [x1]
39 ld1 {SHASH.16b}, [x3] 32 ld1 {SHASH.16b}, [x3]
40 eor VZR.16b, VZR.16b, VZR.16b 33 ld1 {XL.16b}, [x1]
34 movi MASK.16b, #0xe1
35 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
36 shl MASK.2d, MASK.2d, #57
37 eor SHASH2.16b, SHASH2.16b, SHASH.16b
41 38
42 /* do the head block first, if supplied */ 39 /* do the head block first, if supplied */
43 cbz x4, 0f 40 cbz x4, 0f
44 ld1 {IN1.2d}, [x4] 41 ld1 {T1.2d}, [x4]
45 b 1f 42 b 1f
46 43
470: ld1 {IN1.2d}, [x2], #16 440: ld1 {T1.2d}, [x2], #16
48 sub w0, w0, #1 45 sub w0, w0, #1
491: ext IN1.16b, IN1.16b, IN1.16b, #8
50CPU_LE( rev64 IN1.16b, IN1.16b )
51 eor DATA.16b, DATA.16b, IN1.16b
52 46
53 /* multiply DATA by SHASH in GF(2^128) */ 471: /* multiply XL by SHASH in GF(2^128) */
54 ext T2.16b, DATA.16b, DATA.16b, #8 48CPU_LE( rev64 T1.16b, T1.16b )
55 ext T3.16b, SHASH.16b, SHASH.16b, #8
56 eor T2.16b, T2.16b, DATA.16b
57 eor T3.16b, T3.16b, SHASH.16b
58 49
59 pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 50 ext T2.16b, XL.16b, XL.16b, #8
60 pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 51 ext IN1.16b, T1.16b, T1.16b, #8
61 pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) 52 eor T1.16b, T1.16b, T2.16b
62 eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) 53 eor XL.16b, XL.16b, IN1.16b
63 eor T2.16b, T2.16b, DATA.16b
64 54
65 ext T3.16b, VZR.16b, T2.16b, #8 55 pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
66 ext T2.16b, T2.16b, VZR.16b, #8 56 eor T1.16b, T1.16b, XL.16b
67 eor DATA.16b, DATA.16b, T3.16b 57 pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
68 eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of 58 pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
69 // carry-less multiplication
70 59
71 /* first phase of the reduction */ 60 ext T1.16b, XL.16b, XH.16b, #8
72 shl T3.2d, DATA.2d, #1 61 eor T2.16b, XL.16b, XH.16b
73 eor T3.16b, T3.16b, DATA.16b 62 eor XM.16b, XM.16b, T1.16b
74 shl T3.2d, T3.2d, #5 63 eor XM.16b, XM.16b, T2.16b
75 eor T3.16b, T3.16b, DATA.16b 64 pmull T2.1q, XL.1d, MASK.1d
76 shl T3.2d, T3.2d, #57
77 ext T2.16b, VZR.16b, T3.16b, #8
78 ext T3.16b, T3.16b, VZR.16b, #8
79 eor DATA.16b, DATA.16b, T2.16b
80 eor T1.16b, T1.16b, T3.16b
81 65
82 /* second phase of the reduction */ 66 mov XH.d[0], XM.d[1]
83 ushr T2.2d, DATA.2d, #5 67 mov XM.d[1], XL.d[0]
84 eor T2.16b, T2.16b, DATA.16b 68
85 ushr T2.2d, T2.2d, #1 69 eor XL.16b, XM.16b, T2.16b
86 eor T2.16b, T2.16b, DATA.16b 70 ext T2.16b, XL.16b, XL.16b, #8
87 ushr T2.2d, T2.2d, #1 71 pmull XL.1q, XL.1d, MASK.1d
88 eor T1.16b, T1.16b, T2.16b 72 eor T2.16b, T2.16b, XH.16b
89 eor DATA.16b, DATA.16b, T1.16b 73 eor XL.16b, XL.16b, T2.16b
90 74
91 cbnz w0, 0b 75 cbnz w0, 0b
92 76
93 st1 {DATA.16b}, [x1] 77 st1 {XL.16b}, [x1]
94 ret 78 ret
95ENDPROC(pmull_ghash_update) 79ENDPROC(pmull_ghash_update)