author     Ard Biesheuvel <ard.biesheuvel@linaro.org>   2014-06-16 06:02:16 -0400
committer  Catalin Marinas <catalin.marinas@arm.com>    2014-06-18 07:40:54 -0400
commit     b913a6404ce2b7d10a735834218d3c1e1bceff2a (patch)
tree       5d2736f6de8a7600c99e471dfddc6d9a5ed43b2d /arch/arm64/crypto/ghash-ce-core.S
parent     6aa8b209f5ef3610d470c519ddd6e6b47e9f6248 (diff)
arm64/crypto: improve performance of GHASH algorithm
This patch modifies the GHASH secure hash implementation to switch to a
faster, polynomial-multiplication-based reduction instead of one that uses
shifts and rotates.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
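For readers who want the idea behind that claim in executable form, here is a
minimal Python model (ours, not part of the patch or the kernel; the names
clmul, reduce_shift and reduce_clmul are invented, and the plain polynomial
representation is a simplification, since real GHASH works on bit-reflected
values). It contrasts a naive shift/XOR reduction with the carry-less-
multiplication-based folding the patch adopts:

import random

P = (1 << 128) | 0x87   # x^128 + x^7 + x^2 + x + 1, the GHASH field polynomial

def clmul(a, b):
    """Carry-less multiply (what a PMULL instruction computes), bit by bit."""
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def reduce_shift(x):
    """Naive shift/XOR reduction of a 256-bit product modulo P."""
    for i in range(255, 127, -1):
        if (x >> i) & 1:
            x ^= P << (i - 128)
    return x

def reduce_clmul(x):
    """Multiply-based reduction: x^128 == x^7 + x^2 + x + 1 == 0x87 (mod P),
    so fold the high half down with two carry-less multiplies."""
    lo, hi = x & ((1 << 128) - 1), x >> 128
    t = clmul(hi, 0x87)            # first fold: at most 7 bits spill past 128
    return lo ^ (t & ((1 << 128) - 1)) ^ clmul(t >> 128, 0x87)  # absorb spill

for _ in range(100):
    a, b = random.getrandbits(128), random.getrandbits(128)
    assert reduce_shift(clmul(a, b)) == reduce_clmul(clmul(a, b))

The kernel code works in GCM's bit-reflected representation, which is why the
MASK constant below is built from 0xe1 (the bit-reversal of 0x87) rather than
0x87 itself, but the two pmull ... MASK.1d instructions in the new code
correspond to the two folds in reduce_clmul.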
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r--   arch/arm64/crypto/ghash-ce-core.S   92
1 file changed, 38 insertions(+), 54 deletions(-)
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index b9e6eaf41c9b..dc457015884e 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -3,14 +3,6 @@
  *
  * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
- * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
- *
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- *           Vinodh Gopal
- *           Erdinc Ozturk
- *           Deniz Karakoyunlu
- *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
  * by the Free Software Foundation.
@@ -19,13 +11,15 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-	DATA	.req	v0
-	SHASH	.req	v1
-	IN1	.req	v2
+	SHASH	.req	v0
+	SHASH2	.req	v1
 	T1	.req	v2
 	T2	.req	v3
-	T3	.req	v4
-	VZR	.req	v5
+	MASK	.req	v4
+	XL	.req	v5
+	XM	.req	v6
+	XH	.req	v7
+	IN1	.req	v7
 
 	.text
 	.arch		armv8-a+crypto
@@ -35,61 +29,51 @@
  *			   struct ghash_key const *k, const char *head)
  */
 ENTRY(pmull_ghash_update)
-	ld1		{DATA.16b}, [x1]
 	ld1		{SHASH.16b}, [x3]
-	eor		VZR.16b, VZR.16b, VZR.16b
+	ld1		{XL.16b}, [x1]
+	movi		MASK.16b, #0xe1
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	shl		MASK.2d, MASK.2d, #57
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	/* do the head block first, if supplied */
 	cbz		x4, 0f
-	ld1		{IN1.2d}, [x4]
+	ld1		{T1.2d}, [x4]
 	b		1f
 
-0:	ld1		{IN1.2d}, [x2], #16
+0:	ld1		{T1.2d}, [x2], #16
 	sub		w0, w0, #1
-1:	ext		IN1.16b, IN1.16b, IN1.16b, #8
-CPU_LE(	rev64		IN1.16b, IN1.16b	)
-	eor		DATA.16b, DATA.16b, IN1.16b
 
-	/* multiply DATA by SHASH in GF(2^128) */
-	ext		T2.16b, DATA.16b, DATA.16b, #8
-	ext		T3.16b, SHASH.16b, SHASH.16b, #8
-	eor		T2.16b, T2.16b, DATA.16b
-	eor		T3.16b, T3.16b, SHASH.16b
+1:	/* multiply XL by SHASH in GF(2^128) */
+CPU_LE(	rev64		T1.16b, T1.16b	)
 
-	pmull2		T1.1q, SHASH.2d, DATA.2d	// a1 * b1
-	pmull		DATA.1q, SHASH.1d, DATA.1d	// a0 * b0
-	pmull		T2.1q, T2.1d, T3.1d		// (a1 + a0)(b1 + b0)
-	eor		T2.16b, T2.16b, T1.16b		// (a0 * b1) + (a1 * b0)
-	eor		T2.16b, T2.16b, DATA.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	ext		IN1.16b, T1.16b, T1.16b, #8
+	eor		T1.16b, T1.16b, T2.16b
+	eor		XL.16b, XL.16b, IN1.16b
 
-	ext		T3.16b, VZR.16b, T2.16b, #8
-	ext		T2.16b, T2.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T3.16b
-	eor		T1.16b, T1.16b, T2.16b		// <T1:DATA> is result of
-						// carry-less multiplication
+	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
+	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)
 
-	/* first phase of the reduction */
-	shl		T3.2d, DATA.2d, #1
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #5
-	eor		T3.16b, T3.16b, DATA.16b
-	shl		T3.2d, T3.2d, #57
-	ext		T2.16b, VZR.16b, T3.16b, #8
-	ext		T3.16b, T3.16b, VZR.16b, #8
-	eor		DATA.16b, DATA.16b, T2.16b
-	eor		T1.16b, T1.16b, T3.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		T2.16b, XL.16b, XH.16b
+	eor		XM.16b, XM.16b, T1.16b
+	eor		XM.16b, XM.16b, T2.16b
+	pmull		T2.1q, XL.1d, MASK.1d
 
-	/* second phase of the reduction */
-	ushr		T2.2d, DATA.2d, #5
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T2.16b, T2.16b, DATA.16b
-	ushr		T2.2d, T2.2d, #1
-	eor		T1.16b, T1.16b, T2.16b
-	eor		DATA.16b, DATA.16b, T1.16b
+	mov		XH.d[0], XM.d[1]
+	mov		XM.d[1], XL.d[0]
+
+	eor		XL.16b, XM.16b, T2.16b
+	ext		T2.16b, XL.16b, XL.16b, #8
+	pmull		XL.1q, XL.1d, MASK.1d
+	eor		T2.16b, T2.16b, XH.16b
+	eor		XL.16b, XL.16b, T2.16b
 
 	cbnz		w0, 0b
 
-	st1		{DATA.16b}, [x1]
+	st1		{XL.16b}, [x1]
 	ret
 ENDPROC(pmull_ghash_update)
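A closing note on the multiplication itself: the three pmull/pmull2
instructions implement the three-multiply Karatsuba split named in the code's
own comments (a1 * b1, a0 * b0, (a1 + a0)(b1 + b0)), and the new SHASH2
register exists so that b1 + b0 is computed once per call rather than once per
16-byte block. A sketch of that structure in Python (our illustration, with
invented names clmul64 and karatsuba_clmul; not the kernel's code):

import random

M64 = (1 << 64) - 1

def clmul64(a, b):
    """64 x 64 -> 128-bit carry-less multiply: the work of one PMULL/PMULL2."""
    r = 0
    for i in range(64):
        if (b >> i) & 1:
            r ^= a << i
    return r

def karatsuba_clmul(a, b, b_folded):
    """128 x 128 -> 256-bit carry-less multiply using three 64-bit multiplies.
    b_folded = (b & M64) ^ (b >> 64) is precomputed, as SHASH2 is above."""
    a0, a1 = a & M64, a >> 64
    b0, b1 = b & M64, b >> 64
    lo  = clmul64(a0, b0)             # pmull  XL.1q, SHASH.1d, XL.1d   (a0 * b0)
    hi  = clmul64(a1, b1)             # pmull2 XH.1q, SHASH.2d, XL.2d   (a1 * b1)
    mid = clmul64(a0 ^ a1, b_folded)  # pmull  XM.1q, SHASH2.1d, T1.1d
    mid ^= lo ^ hi                    # cancel a0*b0 and a1*b1, keep cross terms
    return (hi << 128) ^ (mid << 64) ^ lo   # 256-bit product, before reduction

# Self-check against a schoolbook 128-bit carry-less multiply.
a, b = random.getrandbits(128), random.getrandbits(128)
ref = 0
for i in range(128):
    if (b >> i) & 1:
        ref ^= a << i
assert karatsuba_clmul(a, b, (b & M64) ^ (b >> 64)) == ref

The ext/eor pairs that follow the three pmulls in the assembly are that same
mid-term cleanup and 64-bit-offset merge, producing the <XH:XL> value that
then enters the MASK-based reduction.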