author	Ard Biesheuvel <ard.biesheuvel@linaro.org>	2019-02-26 06:36:18 -0500
committer	Catalin Marinas <catalin.marinas@arm.com>	2019-02-28 12:48:23 -0500
commit	335ebe3035b6fcb83c3f225bc5135300fc24c827
tree	2f6f093da8a34e9969b6cab3dff85ef090b1363c	/lib/raid6
parent	1ad3935b39da78a403e7df7a3813f866c731bc64
lib/raid6: arm: optimize away a mask operation in NEON recovery routine
The NEON recovery code was modeled after the x86 SIMD code, and for
some reason, that code uses a 16-bit wide signed shift and a mask to
perform what amounts to an 8-bit unsigned shift. So fold the ops
together.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
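
Why the mask was needed in the first place: with a 16-bit arithmetic shift, bits from the high byte of each lane (including replicated sign bits) spill into the low byte's upper nibble, so the result had to be masked with 0x0f before use. An 8-bit unsigned shift fills with zeroes instead, so the high nibble of each byte already lands in 0..15. The following scalar sketch is illustrative only (it is not kernel code, and the byte values are arbitrary); it assumes little-endian lane order, as on typical arm/arm64 kernels:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t lo = 0x9c, hi = 0xf3;	/* arbitrary byte pair, top bits set */
	int16_t lane = (int16_t)(((uint16_t)hi << 8) | lo);

	/* old: 16-bit arithmetic shift (>> on a negative value is
	 * arithmetic on the relevant compilers, matching vshrq_n_s16);
	 * bits of hi, and its sign, spill into lo's upper nibble,
	 * hence the 0x0f mask afterwards */
	uint16_t shifted = (uint16_t)(lane >> 4);
	uint8_t old_lo = (uint8_t)(shifted & 0x0f);
	uint8_t old_hi = (uint8_t)((shifted >> 8) & 0x0f);

	/* new: per-byte logical shift, zero-filled, no mask needed */
	uint8_t new_lo = lo >> 4;
	uint8_t new_hi = hi >> 4;

	printf("old %x %x  new %x %x\n", old_lo, old_hi, new_lo, new_hi);
	return 0;
}

Both variants print the same two nibbles (9 and f for these inputs), which is the equivalence the patch exploits.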
Diffstat (limited to 'lib/raid6')
-rw-r--r--	lib/raid6/recov_neon_inner.c	12
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
index 7d00c31a6547..f13c07f82297 100644
--- a/lib/raid6/recov_neon_inner.c
+++ b/lib/raid6/recov_neon_inner.c
@@ -56,14 +56,14 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
 	px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
 	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 
-	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+	vy = vshrq_n_u8(vx, 4);
 	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
-	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+	vy = vqtbl1q_u8(qm1, vy);
 	qx = veorq_u8(vx, vy);
 
-	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
+	vy = vshrq_n_u8(px, 4);
 	vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
-	vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
+	vy = vqtbl1q_u8(pm1, vy);
 	vx = veorq_u8(vx, vy);
 	db = veorq_u8(vx, qx);
 
@@ -97,9 +97,9 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
 
 	vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 
-	vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+	vy = vshrq_n_u8(vx, 4);
 	vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
-	vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+	vy = vqtbl1q_u8(qm1, vy);
 	vx = veorq_u8(vx, vy);
 	vy = veorq_u8(vx, vld1q_u8(p));
 
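
As a sanity check at the intrinsic level, a hypothetical standalone harness (not part of the kernel tree; it assumes an AArch64 toolchain, since vqtbl1q_u8 is an AArch64 intrinsic and the 32-bit kernel build of this file uses a local equivalent) can verify that both index computations feed vqtbl1q_u8 identically. vqtbl1q_u8 requires indices in 0..15 (out-of-range index bytes select zero), which the old code guaranteed with the mask and the new code gets for free from vshrq_n_u8:

/* Hypothetical test harness, not kernel code.
 * Build with e.g. gcc -O2 test.c && ./a.out on an AArch64 host. */
#include <arm_neon.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t table[16], in[16], a[16], b[16];

	for (int i = 0; i < 16; i++) {
		table[i] = (uint8_t)(i * 0x1d);		/* arbitrary lookup table */
		in[i] = (uint8_t)(i * 37 + 0x9c);	/* includes bytes >= 0x80 */
	}

	uint8x16_t qm  = vld1q_u8(table);
	uint8x16_t vx  = vld1q_u8(in);
	uint8x16_t x0f = vdupq_n_u8(0x0f);

	/* old: 16-bit arithmetic shift, then mask to get valid indices
	 * (vreinterpretq used here in place of the kernel's C casts) */
	uint8x16_t vy_old = vreinterpretq_u8_s16(
			vshrq_n_s16(vreinterpretq_s16_u8(vx), 4));
	uint8x16_t r_old = vqtbl1q_u8(qm, vandq_u8(vy_old, x0f));

	/* new: 8-bit unsigned shift already yields indices in 0..15 */
	uint8x16_t r_new = vqtbl1q_u8(qm, vshrq_n_u8(vx, 4));

	vst1q_u8(a, r_old);
	vst1q_u8(b, r_new);
	puts(memcmp(a, b, 16) ? "differ" : "match");
	return 0;
}

Note that the vandq_u8(vx, x0f) on the low-nibble path is untouched by the patch: those bytes are not shifted, so the mask there is still doing real work.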