diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 21:07:32 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 21:07:32 -0400 |
| commit | 2e032852245b3dcfe5461d7353e34eb6da095ccf (patch) | |
| tree | 69f9fdf03b54d76bb539096e0ec96e91ea8216b1 /lib | |
| parent | 356f9e74ffaafd11741589a9aa21d6c9d2721417 (diff) | |
| parent | 141b97433d77e39ac3ac111a7b3852192035259c (diff) | |
Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm
Pull ARM updates from Russell King:
"This set includes adding support for Neon acceleration of RAID6 XOR
code from Ard Biesheuvel, cache flushing and barrier updates from Will
Deacon, and a cleanup to the ARM debug code which reduces the amount
of code by about 500 lines.
A few other cleanups, such as constifying the machine descriptors
which already shouldn't be written to, cleaning up the printing of the
L2 cache size"
* 'for-linus' of git://git.linaro.org/people/rmk/linux-arm: (55 commits)
ARM: 7826/1: debug: support debug ll on hisilicon soc
ARM: 7830/1: delay: don't bother reporting bogomips in /proc/cpuinfo
ARM: 7829/1: Add ".text.unlikely" and ".text.hot" to arm unwind tables
ARM: 7828/1: ARMv7-M: implement restart routine common to all v7-M machines
ARM: 7827/1: highbank: fix debug uart virtual address for LPAE
ARM: 7823/1: errata: workaround Cortex-A15 erratum 773022
ARM: 7806/1: allow DEBUG_UNCOMPRESS for Tegra
ARM: 7793/1: debug: use generic option for ep93xx PL10x debug port
ARM: debug: move SPEAr debug to generic PL01x code
ARM: debug: move davinci debug to generic 8250 code
ARM: debug: move keystone debug to generic 8250 code
ARM: debug: remove DEBUG_ROCKCHIP_UART
ARM: debug: provide generic option choices for 8250 and PL01x ports
ARM: debug: move PL01X debug include into arch/arm/include/debug/
ARM: debug: provide PL01x debug uart phys/virt address configuration options
ARM: debug: add support for word accesses to debug/8250.S
ARM: debug: move 8250 debug include into arch/arm/include/debug/
ARM: debug: provide 8250 debug uart phys/virt address configuration options
ARM: debug: provide 8250 debug uart register shift configuration option
ARM: debug: provide 8250 debug uart flow control configuration option
...
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/raid6/.gitignore | 1 | ||||
| -rw-r--r-- | lib/raid6/Makefile | 40 | ||||
| -rw-r--r-- | lib/raid6/algos.c | 6 | ||||
| -rw-r--r-- | lib/raid6/neon.c | 58 | ||||
| -rw-r--r-- | lib/raid6/neon.uc | 80 | ||||
| -rw-r--r-- | lib/raid6/test/Makefile | 26 |
6 files changed, 210 insertions, 1 deletions
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 162becacf97c..0a7e494b2bcd 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore | |||
| @@ -2,3 +2,4 @@ mktables | |||
| 2 | altivec*.c | 2 | altivec*.c |
| 3 | int*.c | 3 | int*.c |
| 4 | tables.c | 4 | tables.c |
| 5 | neon?.c | ||
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 9f7c184725d7..b4625787c7ee 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile | |||
| @@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ | |||
| 5 | 5 | ||
| 6 | raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o | 6 | raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o |
| 7 | raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o | 7 | raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o |
| 8 | raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o | ||
| 8 | 9 | ||
| 9 | hostprogs-y += mktables | 10 | hostprogs-y += mktables |
| 10 | 11 | ||
| @@ -16,6 +17,21 @@ ifeq ($(CONFIG_ALTIVEC),y) | |||
| 16 | altivec_flags := -maltivec -mabi=altivec | 17 | altivec_flags := -maltivec -mabi=altivec |
| 17 | endif | 18 | endif |
| 18 | 19 | ||
| 20 | # The GCC option -ffreestanding is required in order to compile code containing | ||
| 21 | # ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) | ||
| 22 | ifeq ($(CONFIG_KERNEL_MODE_NEON),y) | ||
| 23 | NEON_FLAGS := -ffreestanding | ||
| 24 | ifeq ($(ARCH),arm) | ||
| 25 | NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon | ||
| 26 | endif | ||
| 27 | ifeq ($(ARCH),arm64) | ||
| 28 | CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only | ||
| 29 | CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only | ||
| 30 | CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only | ||
| 31 | CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only | ||
| 32 | endif | ||
| 33 | endif | ||
| 34 | |||
| 19 | targets += int1.c | 35 | targets += int1.c |
| 20 | $(obj)/int1.c: UNROLL := 1 | 36 | $(obj)/int1.c: UNROLL := 1 |
| 21 | $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE | 37 | $(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE |
| @@ -70,6 +86,30 @@ $(obj)/altivec8.c: UNROLL := 8 | |||
| 70 | $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE | 86 | $(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE |
| 71 | $(call if_changed,unroll) | 87 | $(call if_changed,unroll) |
| 72 | 88 | ||
| 89 | CFLAGS_neon1.o += $(NEON_FLAGS) | ||
| 90 | targets += neon1.c | ||
| 91 | $(obj)/neon1.c: UNROLL := 1 | ||
| 92 | $(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
| 93 | $(call if_changed,unroll) | ||
| 94 | |||
| 95 | CFLAGS_neon2.o += $(NEON_FLAGS) | ||
| 96 | targets += neon2.c | ||
| 97 | $(obj)/neon2.c: UNROLL := 2 | ||
| 98 | $(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
| 99 | $(call if_changed,unroll) | ||
| 100 | |||
| 101 | CFLAGS_neon4.o += $(NEON_FLAGS) | ||
| 102 | targets += neon4.c | ||
| 103 | $(obj)/neon4.c: UNROLL := 4 | ||
| 104 | $(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
| 105 | $(call if_changed,unroll) | ||
| 106 | |||
| 107 | CFLAGS_neon8.o += $(NEON_FLAGS) | ||
| 108 | targets += neon8.c | ||
| 109 | $(obj)/neon8.c: UNROLL := 8 | ||
| 110 | $(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE | ||
| 111 | $(call if_changed,unroll) | ||
| 112 | |||
| 73 | quiet_cmd_mktable = TABLE $@ | 113 | quiet_cmd_mktable = TABLE $@ |
| 74 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) | 114 | cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) |
| 75 | 115 | ||
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 6d7316fe9f30..74e6f5629dbc 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c | |||
| @@ -70,6 +70,12 @@ const struct raid6_calls * const raid6_algos[] = { | |||
| 70 | &raid6_intx2, | 70 | &raid6_intx2, |
| 71 | &raid6_intx4, | 71 | &raid6_intx4, |
| 72 | &raid6_intx8, | 72 | &raid6_intx8, |
| 73 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
| 74 | &raid6_neonx1, | ||
| 75 | &raid6_neonx2, | ||
| 76 | &raid6_neonx4, | ||
| 77 | &raid6_neonx8, | ||
| 78 | #endif | ||
| 73 | NULL | 79 | NULL |
| 74 | }; | 80 | }; |
| 75 | 81 | ||
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c new file mode 100644 index 000000000000..36ad4705df1a --- /dev/null +++ b/lib/raid6/neon.c | |||
| @@ -0,0 +1,58 @@ | |||
| 1 | /* | ||
| 2 | * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics | ||
| 3 | * | ||
| 4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/raid/pq.h> | ||
| 12 | |||
| 13 | #ifdef __KERNEL__ | ||
| 14 | #include <asm/neon.h> | ||
| 15 | #else | ||
| 16 | #define kernel_neon_begin() | ||
| 17 | #define kernel_neon_end() | ||
| 18 | #define cpu_has_neon() (1) | ||
| 19 | #endif | ||
| 20 | |||
| 21 | /* | ||
| 22 | * There are 2 reasons these wrappers are kept in a separate compilation unit | ||
| 23 | * from the actual implementations in neonN.c (generated from neon.uc by | ||
| 24 | * unroll.awk): | ||
| 25 | * - the actual implementations use NEON intrinsics, and the GCC support header | ||
| 26 | * (arm_neon.h) is not fully compatible (type wise) with the kernel; | ||
| 27 | * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, | ||
| 28 | * and we have to make sure that we never use *any* NEON/VFP instructions | ||
| 29 | * outside a kernel_neon_begin()/kernel_neon_end() pair. | ||
| 30 | */ | ||
| 31 | |||
| 32 | #define RAID6_NEON_WRAPPER(_n) \ | ||
| 33 | static void raid6_neon ## _n ## _gen_syndrome(int disks, \ | ||
| 34 | size_t bytes, void **ptrs) \ | ||
| 35 | { \ | ||
| 36 | void raid6_neon ## _n ## _gen_syndrome_real(int, \ | ||
| 37 | unsigned long, void**); \ | ||
| 38 | kernel_neon_begin(); \ | ||
| 39 | raid6_neon ## _n ## _gen_syndrome_real(disks, \ | ||
| 40 | (unsigned long)bytes, ptrs); \ | ||
| 41 | kernel_neon_end(); \ | ||
| 42 | } \ | ||
| 43 | struct raid6_calls const raid6_neonx ## _n = { \ | ||
| 44 | raid6_neon ## _n ## _gen_syndrome, \ | ||
| 45 | raid6_have_neon, \ | ||
| 46 | "neonx" #_n, \ | ||
| 47 | 0 \ | ||
| 48 | } | ||
| 49 | |||
| 50 | static int raid6_have_neon(void) | ||
| 51 | { | ||
| 52 | return cpu_has_neon(); | ||
| 53 | } | ||
| 54 | |||
| 55 | RAID6_NEON_WRAPPER(1); | ||
| 56 | RAID6_NEON_WRAPPER(2); | ||
| 57 | RAID6_NEON_WRAPPER(4); | ||
| 58 | RAID6_NEON_WRAPPER(8); | ||
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc new file mode 100644 index 000000000000..1b9ed793342d --- /dev/null +++ b/lib/raid6/neon.uc | |||
| @@ -0,0 +1,80 @@ | |||
| 1 | /* ----------------------------------------------------------------------- | ||
| 2 | * | ||
| 3 | * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions | ||
| 4 | * | ||
| 5 | * Copyright (C) 2012 Rob Herring | ||
| 6 | * | ||
| 7 | * Based on altivec.uc: | ||
| 8 | * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or modify | ||
| 11 | * it under the terms of the GNU General Public License as published by | ||
| 12 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
| 13 | * Boston MA 02111-1307, USA; either version 2 of the License, or | ||
| 14 | * (at your option) any later version; incorporated herein by reference. | ||
| 15 | * | ||
| 16 | * ----------------------------------------------------------------------- */ | ||
| 17 | |||
| 18 | /* | ||
| 19 | * neon$#.c | ||
| 20 | * | ||
| 21 | * $#-way unrolled NEON intrinsics math RAID-6 instruction set | ||
| 22 | * | ||
| 23 | * This file is postprocessed using unroll.awk | ||
| 24 | */ | ||
| 25 | |||
| 26 | #include <arm_neon.h> | ||
| 27 | |||
| 28 | typedef uint8x16_t unative_t; | ||
| 29 | |||
| 30 | #define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) | ||
| 31 | #define NSIZE sizeof(unative_t) | ||
| 32 | |||
| 33 | /* | ||
| 34 | * The SHLBYTE() operation shifts each byte left by 1, *not* | ||
| 35 | * rolling over into the next byte | ||
| 36 | */ | ||
| 37 | static inline unative_t SHLBYTE(unative_t v) | ||
| 38 | { | ||
| 39 | return vshlq_n_u8(v, 1); | ||
| 40 | } | ||
| 41 | |||
| 42 | /* | ||
| 43 | * The MASK() operation returns 0xFF in any byte for which the high | ||
| 44 | * bit is 1, 0x00 for any byte for which the high bit is 0. | ||
| 45 | */ | ||
| 46 | static inline unative_t MASK(unative_t v) | ||
| 47 | { | ||
| 48 | const uint8x16_t temp = NBYTES(0); | ||
| 49 | return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); | ||
| 50 | } | ||
| 51 | |||
| 52 | void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) | ||
| 53 | { | ||
| 54 | uint8_t **dptr = (uint8_t **)ptrs; | ||
| 55 | uint8_t *p, *q; | ||
| 56 | int d, z, z0; | ||
| 57 | |||
| 58 | register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; | ||
| 59 | const unative_t x1d = NBYTES(0x1d); | ||
| 60 | |||
| 61 | z0 = disks - 3; /* Highest data disk */ | ||
| 62 | p = dptr[z0+1]; /* XOR parity */ | ||
| 63 | q = dptr[z0+2]; /* RS syndrome */ | ||
| 64 | |||
| 65 | for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { | ||
| 66 | wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); | ||
| 67 | for ( z = z0-1 ; z >= 0 ; z-- ) { | ||
| 68 | wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); | ||
| 69 | wp$$ = veorq_u8(wp$$, wd$$); | ||
| 70 | w2$$ = MASK(wq$$); | ||
| 71 | w1$$ = SHLBYTE(wq$$); | ||
| 72 | |||
| 73 | w2$$ = vandq_u8(w2$$, x1d); | ||
| 74 | w1$$ = veorq_u8(w1$$, w2$$); | ||
| 75 | wq$$ = veorq_u8(w1$$, wd$$); | ||
| 76 | } | ||
| 77 | vst1q_u8(&p[d+NSIZE*$$], wp$$); | ||
| 78 | vst1q_u8(&q[d+NSIZE*$$], wq$$); | ||
| 79 | } | ||
| 80 | } | ||
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 087332dbf8aa..28afa1a06e03 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile | |||
| @@ -22,11 +22,23 @@ ifeq ($(ARCH),x86_64) | |||
| 22 | IS_X86 = yes | 22 | IS_X86 = yes |
| 23 | endif | 23 | endif |
| 24 | 24 | ||
| 25 | ifeq ($(ARCH),arm) | ||
| 26 | CFLAGS += -I../../../arch/arm/include -mfpu=neon | ||
| 27 | HAS_NEON = yes | ||
| 28 | endif | ||
| 29 | ifeq ($(ARCH),arm64) | ||
| 30 | CFLAGS += -I../../../arch/arm64/include | ||
| 31 | HAS_NEON = yes | ||
| 32 | endif | ||
| 33 | |||
| 25 | ifeq ($(IS_X86),yes) | 34 | ifeq ($(IS_X86),yes) |
| 26 | OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o | 35 | OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o |
| 27 | CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ | 36 | CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ |
| 28 | gcc -c -x assembler - >&/dev/null && \ | 37 | gcc -c -x assembler - >&/dev/null && \ |
| 29 | rm ./-.o && echo -DCONFIG_AS_AVX2=1) | 38 | rm ./-.o && echo -DCONFIG_AS_AVX2=1) |
| 39 | else ifeq ($(HAS_NEON),yes) | ||
| 40 | OBJS += neon.o neon1.o neon2.o neon4.o neon8.o | ||
| 41 | CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 | ||
| 30 | else | 42 | else |
| 31 | HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ | 43 | HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\ |
| 32 | gcc -c -x c - >&/dev/null && \ | 44 | gcc -c -x c - >&/dev/null && \ |
| @@ -55,6 +67,18 @@ raid6.a: $(OBJS) | |||
| 55 | raid6test: test.c raid6.a | 67 | raid6test: test.c raid6.a |
| 56 | $(CC) $(CFLAGS) -o raid6test $^ | 68 | $(CC) $(CFLAGS) -o raid6test $^ |
| 57 | 69 | ||
| 70 | neon1.c: neon.uc ../unroll.awk | ||
| 71 | $(AWK) ../unroll.awk -vN=1 < neon.uc > $@ | ||
| 72 | |||
| 73 | neon2.c: neon.uc ../unroll.awk | ||
| 74 | $(AWK) ../unroll.awk -vN=2 < neon.uc > $@ | ||
| 75 | |||
| 76 | neon4.c: neon.uc ../unroll.awk | ||
| 77 | $(AWK) ../unroll.awk -vN=4 < neon.uc > $@ | ||
| 78 | |||
| 79 | neon8.c: neon.uc ../unroll.awk | ||
| 80 | $(AWK) ../unroll.awk -vN=8 < neon.uc > $@ | ||
| 81 | |||
| 58 | altivec1.c: altivec.uc ../unroll.awk | 82 | altivec1.c: altivec.uc ../unroll.awk |
| 59 | $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ | 83 | $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ |
| 60 | 84 | ||
| @@ -89,7 +113,7 @@ tables.c: mktables | |||
| 89 | ./mktables > tables.c | 113 | ./mktables > tables.c |
| 90 | 114 | ||
| 91 | clean: | 115 | clean: |
| 92 | rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test | 116 | rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test |
| 93 | 117 | ||
| 94 | spotless: clean | 118 | spotless: clean |
| 95 | rm -f *~ | 119 | rm -f *~ |
