author	Ard Biesheuvel <ard.biesheuvel@linaro.org>	2019-02-26 06:36:18 -0500
committer	Catalin Marinas <catalin.marinas@arm.com>	2019-02-28 12:48:23 -0500
commit	335ebe3035b6fcb83c3f225bc5135300fc24c827
tree	2f6f093da8a34e9969b6cab3dff85ef090b1363c	/lib/raid6
parent	1ad3935b39da78a403e7df7a3813f866c731bc64
lib/raid6: arm: optimize away a mask operation in NEON recovery routine
The NEON recovery code was modeled after the x86 SIMD code, and for
some reason, that code uses a 16-bit wide signed shift and a mask to
perform what amounts to an 8-bit unsigned shift. So fold the ops
together.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
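
Why the mask was needed in the first place: with a 16-bit arithmetic shift, bits from the high byte of each lane (including replicated sign bits) spill into the low byte's upper nibble, so the result had to be masked with 0x0f before use. An 8-bit unsigned shift fills with zeroes instead, so the high nibble of each byte already lands in 0..15. The following scalar sketch is illustrative only (it is not kernel code, and the byte values are arbitrary); it assumes little-endian lane order, as on typical arm/arm64 kernels:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t lo = 0x9c, hi = 0xf3;	/* arbitrary byte pair, top bits set */
	int16_t lane = (int16_t)(((uint16_t)hi << 8) | lo);

	/* old: 16-bit arithmetic shift (>> on a negative value is
	 * arithmetic on the relevant compilers, matching vshrq_n_s16);
	 * bits of hi, and its sign, spill into lo's upper nibble,
	 * hence the 0x0f mask afterwards */
	uint16_t shifted = (uint16_t)(lane >> 4);
	uint8_t old_lo = (uint8_t)(shifted & 0x0f);
	uint8_t old_hi = (uint8_t)((shifted >> 8) & 0x0f);

	/* new: per-byte logical shift, zero-filled, no mask needed */
	uint8_t new_lo = lo >> 4;
	uint8_t new_hi = hi >> 4;

	printf("old %x %x  new %x %x\n", old_lo, old_hi, new_lo, new_hi);
	return 0;
}

Both variants print the same two nibbles (9 and f for these inputs), which is the equivalence the patch exploits.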
Diffstat (limited to 'lib/raid6')
-rw-r--r--	lib/raid6/recov_neon_inner.c	12
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
index 7d00c31a6547..f13c07f82297 100644
--- a/lib/raid6/recov_neon_inner.c
+++ b/lib/raid6/recov_neon_inner.c
@@ -56,14 +56,14 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
 	px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
 	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 
-	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+	vy = vshrq_n_u8(vx, 4);
 	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
-	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+	vy = vqtbl1q_u8(qm1, vy);
 	qx = veorq_u8(vx, vy);
 
-	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
+	vy = vshrq_n_u8(px, 4);
 	vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
-	vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
+	vy = vqtbl1q_u8(pm1, vy);
 	vx = veorq_u8(vx, vy);
 	db = veorq_u8(vx, qx);
 
@@ -97,9 +97,9 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
 
 	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 
-	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+	vy = vshrq_n_u8(vx, 4);
 	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
-	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+	vy = vqtbl1q_u8(qm1, vy);
 	vx = veorq_u8(vx, vy);
 	vy = veorq_u8(vx, vld1q_u8(p));
 
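
As a sanity check at the intrinsic level, a hypothetical standalone harness (not part of the kernel tree; it assumes an AArch64 toolchain, since vqtbl1q_u8 is an AArch64 intrinsic and the 32-bit kernel build of this file uses a local equivalent) can verify that both index computations feed vqtbl1q_u8 identically. vqtbl1q_u8 requires indices in 0..15 (out-of-range index bytes select zero), which the old code guaranteed with the mask and the new code gets for free from vshrq_n_u8:

/* Hypothetical test harness, not kernel code.
 * Build with e.g. gcc -O2 test.c && ./a.out on an AArch64 host. */
#include <arm_neon.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t table[16], in[16], a[16], b[16];

	for (int i = 0; i < 16; i++) {
		table[i] = (uint8_t)(i * 0x1d);		/* arbitrary lookup table */
		in[i] = (uint8_t)(i * 37 + 0x9c);	/* includes bytes >= 0x80 */
	}

	uint8x16_t qm  = vld1q_u8(table);
	uint8x16_t vx  = vld1q_u8(in);
	uint8x16_t x0f = vdupq_n_u8(0x0f);

	/* old: 16-bit arithmetic shift, then mask to get valid indices
	 * (vreinterpretq used here in place of the kernel's C casts) */
	uint8x16_t vy_old = vreinterpretq_u8_s16(
			vshrq_n_s16(vreinterpretq_s16_u8(vx), 4));
	uint8x16_t r_old = vqtbl1q_u8(qm, vandq_u8(vy_old, x0f));

	/* new: 8-bit unsigned shift already yields indices in 0..15 */
	uint8x16_t r_new = vqtbl1q_u8(qm, vshrq_n_u8(vx, 4));

	vst1q_u8(a, r_old);
	vst1q_u8(b, r_new);
	puts(memcmp(a, b, 16) ? "differ" : "match");
	return 0;
}

Note that the vandq_u8(vx, x0f) on the low-nibble path is untouched by the patch: those bytes are not shifted, so the mask there is still doing real work.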