crypto: gf128mul - define gf128mul_x_* in gf128mul.h

The gf128mul_x_ble function is currently defined in gf128mul.c, because it depends on the gf128mul_table_be multiplication table. However, since the function is very small and only uses two values from the table, it is better for it to be defined as an inline function in gf128mul.h. That way, the function can be inlined by the compiler for better performance. For consistency, the other gf128mul_x_* functions are also moved to the header file. In addition, the code is rewritten to be constant-time. After this change, the speed of the generic 'xts(aes)' implementation increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup benchmark -c aes-xts-plain64' on an Intel system with CRYPTO_AES_X86_64 and CRYPTO_AES_NI_INTEL disabled). Signed-off-by: Ondrej Mosnacek <omosnacek@gmail.com> Reviewed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Ondrej Mosnáček <omosnacek@gmail.com> 2017-04-02 15:19:13 -0400
committer: Herbert Xu <herbert@gondor.apana.org.au> 2017-04-05 09:58:35 -0400
commit: acb9b159c784dc0033ede0dadde876ebd93aca4c (patch)
tree: 4bc26c590b0fdcb517e996197958aa2654496f06 /include/crypto
parent: f275d3856cf597419293cd7d95aa628d3073f556 (diff)
1 files changed, 53 insertions, 2 deletions
diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h
index 0bc9b5f1c45e..35ced9db70ea 100644
--- a/include/crypto/gf128mul.h
+++ b/include/crypto/gf128mul.h
@@ -49,6 +49,7 @@
 #ifndef _CRYPTO_GF128MUL_H
 #define _CRYPTO_GF128MUL_H
+#include <asm/byteorder.h>
 #include <crypto/b128ops.h>
 #include <linux/slab.h>
@@ -163,8 +164,58 @@ void gf128mul_lle(be128 *a, const be128 *b);
 void gf128mul_bbe(be128 *a, const be128 *b);
-/* multiply by x in ble format, needed by XTS */
+/*
-void gf128mul_x_ble(be128 *a, const be128 *b);
+ * The following functions multiply a field element by x in
+ * the polynomial field representation.  They use 64-bit word operations
+ * to gain speed but compensate for machine endianness and hence work
+ * correctly on both styles of machine.
+ *
+ * They are defined here for performance.
+ */
+static inline u64 gf128mul_mask_from_bit(u64 x, int which)
+{
+        /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0'
+         *
+         * Returns all ones when bit 'which' of x is set and zero otherwise.
+         * The left shift moves the selected bit into bit 63; the cast to
+         * s64 followed by '>> 63' then copies it into every bit position,
+         * so the expression contains no branch and no table lookup.
+         *
+         * 'which' has to be in the range 0..63 so that the shift count
+         * '63 - which' stays within the width of u64.
+         *
+         * NOTE(review): this relies on '>>' of a negative s64 being an
+         * arithmetic shift, which C leaves implementation-defined;
+         * presumably guaranteed by the supported compilers, confirm.
+         */
+        return ((s64)(x << (63 - which)) >> 63);
+}
+static inline void gf128mul_x_lle(be128 *r, const be128 *x)
+{
+        u64 a = be64_to_cpu(x->a);
+        u64 b = be64_to_cpu(x->b);
+        /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48
+         * (see crypto/gf128mul.c):
+         *
+         * Multiply the field element at x by x ('lle' variant) and store
+         * the product at r.
+         *
+         * _tt is (u64)0xe1 << 56 when bit 0 of b is set and 0 otherwise.
+         * That bit is the one dropped by the one-bit right shift of the
+         * 128-bit value a:b performed below (the low bit of a is carried
+         * into the top bit of b); _tt is XORed into the high word of the
+         * result in its place.
+         *
+         * Both words of x are loaded before r is written, so r and x may
+         * point to the same object.
+         */
+        u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56);
+        r->b = cpu_to_be64((b >> 1) | (a << 63));
+        r->a = cpu_to_be64((a >> 1) ^ _tt);
+}
+static inline void gf128mul_x_bbe(be128 *r, const be128 *x)
+{
+        u64 a = be64_to_cpu(x->a);
+        u64 b = be64_to_cpu(x->b);
+        /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c):
+         *
+         * Multiply the field element at x by x ('bbe' variant) and store
+         * the product at r.
+         *
+         * _tt is 0x87 when bit 63 of a is set and 0 otherwise.  That bit
+         * is the one dropped by the one-bit left shift of the 128-bit
+         * value a:b performed below (the top bit of b is carried into the
+         * low bit of a); _tt is XORed into the low word of the result in
+         * its place.
+         *
+         * Both words of x are loaded before r is written, so r and x may
+         * point to the same object.
+         */
+        u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
+        r->a = cpu_to_be64((a << 1) | (b >> 63));
+        r->b = cpu_to_be64((b << 1) ^ _tt);
+}
+/* needed by XTS */
+static inline void gf128mul_x_ble(be128 *r, const be128 *x)
+{
+        u64 a = le64_to_cpu(x->a);
+        u64 b = le64_to_cpu(x->b);
+        /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c):
+         *
+         * Multiply the field element at x by x ('ble' variant) and store
+         * the product at r.
+         *
+         * Here b is the high word and a the low word.  _tt is 0x87 when
+         * bit 63 of b is set and 0 otherwise.  That bit is the one dropped
+         * by the one-bit left shift of the 128-bit value b:a performed
+         * below (the top bit of a is carried into the low bit of b); _tt
+         * is XORed into the low word of the result in its place.
+         *
+         * Both words of x are loaded before r is written, so r and x may
+         * point to the same object.
+         *
+         * NOTE(review): x and r are typed be128, yet their words are
+         * converted with the le64 helpers; presumably the 'ble' format
+         * stores each word little-endian - confirm the type annotation
+         * is intended.
+         */
+        u64 _tt = gf128mul_mask_from_bit(b, 63) & 0x87;
+        r->a = cpu_to_le64((a << 1) ^ _tt);
+        r->b = cpu_to_le64((b << 1) | (a >> 63));
+}
 /* 4k table optimization */
author	Ondrej Mosnáček <omosnacek@gmail.com>	2017-04-02 15:19:13 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2017-04-05 09:58:35 -0400
commit	acb9b159c784dc0033ede0dadde876ebd93aca4c (patch)
tree	4bc26c590b0fdcb517e996197958aa2654496f06 /include/crypto
parent	f275d3856cf597419293cd7d95aa628d3073f556 (diff)

diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index 0bc9b5f1c45e..35ced9db70ea 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h
@@ -49,6 +49,7 @@
49	#ifndef _CRYPTO_GF128MUL_H	49	#ifndef _CRYPTO_GF128MUL_H
50	#define _CRYPTO_GF128MUL_H	50	#define _CRYPTO_GF128MUL_H
51		51
		52	#include <asm/byteorder.h>
52	#include <crypto/b128ops.h>	53	#include <crypto/b128ops.h>
53	#include <linux/slab.h>	54	#include <linux/slab.h>
54		55
@@ -163,8 +164,58 @@ void gf128mul_lle(be128 *a, const be128 *b);
163		164
164	void gf128mul_bbe(be128 *a, const be128 *b);	165	void gf128mul_bbe(be128 *a, const be128 *b);
165		166
166	/* multiply by x in ble format, needed by XTS */	167	/*
167	void gf128mul_x_ble(be128 *a, const be128 *b);	168	* The following functions multiply a field element by x in
		169	* the polynomial field representation. They use 64-bit word operations
		170	* to gain speed but compensate for machine endianness and hence work
		171	* correctly on both styles of machine.
		172	*
		173	* They are defined here for performance.
		174	*/
		175
		176	static inline u64 gf128mul_mask_from_bit(u64 x, int which)
		177	{
		178	/* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */
		179	return ((s64)(x << (63 - which)) >> 63);
		180	}
		181
		182	static inline void gf128mul_x_lle(be128 *r, const be128 *x)
		183	{
		184	u64 a = be64_to_cpu(x->a);
		185	u64 b = be64_to_cpu(x->b);
		186
		187	/* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48
		188	* (see crypto/gf128mul.c): */
		189	u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56);
		190
		191	r->b = cpu_to_be64((b >> 1) | (a << 63));
		192	r->a = cpu_to_be64((a >> 1) ^ _tt);
		193	}
		194
		195	static inline void gf128mul_x_bbe(be128 *r, const be128 *x)
		196	{
		197	u64 a = be64_to_cpu(x->a);
		198	u64 b = be64_to_cpu(x->b);
		199
		200	/* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */
		201	u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87;
		202
		203	r->a = cpu_to_be64((a << 1) | (b >> 63));
		204	r->b = cpu_to_be64((b << 1) ^ _tt);
		205	}
		206
		207	/* needed by XTS */
		208	static inline void gf128mul_x_ble(be128 *r, const be128 *x)
		209	{
		210	u64 a = le64_to_cpu(x->a);
		211	u64 b = le64_to_cpu(x->b);
		212
		213	/* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */
		214	u64 _tt = gf128mul_mask_from_bit(b, 63) & 0x87;
		215
		216	r->a = cpu_to_le64((a << 1) ^ _tt);
		217	r->b = cpu_to_le64((b << 1) | (a >> 63));
		218	}
168		219
169	/* 4k table optimization */	220	/* 4k table optimization */
170		221