96 files changed, 6391 insertions, 605 deletions
diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index f89515adac60..eb577f4f5f70 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -52,15 +52,7 @@ extern inline void *return_address(unsigned int level) | |||
52 | 52 | ||
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | #define HAVE_ARCH_CALLER_ADDR | 55 | #define ftrace_return_addr(n) return_address(n) |
56 | |||
57 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
58 | #define CALLER_ADDR1 ((unsigned long)return_address(1)) | ||
59 | #define CALLER_ADDR2 ((unsigned long)return_address(2)) | ||
60 | #define CALLER_ADDR3 ((unsigned long)return_address(3)) | ||
61 | #define CALLER_ADDR4 ((unsigned long)return_address(4)) | ||
62 | #define CALLER_ADDR5 ((unsigned long)return_address(5)) | ||
63 | #define CALLER_ADDR6 ((unsigned long)return_address(6)) | ||
64 | 56 | ||
65 | #endif /* ifndef __ASSEMBLY__ */ | 57 | #endif /* ifndef __ASSEMBLY__ */ |
66 | 58 | ||
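With the per-level CALLER_ADDRn macros gone, the architecture only has to provide ftrace_return_addr(n), which on ARM maps straight to return_address(n); the common ftrace code can build the higher-level helpers on top of that single hook. A rough consumer-side sketch (kernel context assumed, not part of this patch):

	/* Hypothetical consumer sketch: grab the caller two call frames up
	 * via the new arch hook; on ARM this expands to return_address(2). */
	static unsigned long two_levels_up(void)
	{
		return (unsigned long)ftrace_return_addr(2);
	}
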
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e384ab9b3862..7295419165e1 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -30,12 +30,17 @@ config ARM64 | |||
30 | select HAVE_ARCH_JUMP_LABEL | 30 | select HAVE_ARCH_JUMP_LABEL |
31 | select HAVE_ARCH_KGDB | 31 | select HAVE_ARCH_KGDB |
32 | select HAVE_ARCH_TRACEHOOK | 32 | select HAVE_ARCH_TRACEHOOK |
33 | select HAVE_C_RECORDMCOUNT | ||
33 | select HAVE_DEBUG_BUGVERBOSE | 34 | select HAVE_DEBUG_BUGVERBOSE |
34 | select HAVE_DEBUG_KMEMLEAK | 35 | select HAVE_DEBUG_KMEMLEAK |
35 | select HAVE_DMA_API_DEBUG | 36 | select HAVE_DMA_API_DEBUG |
36 | select HAVE_DMA_ATTRS | 37 | select HAVE_DMA_ATTRS |
37 | select HAVE_DMA_CONTIGUOUS | 38 | select HAVE_DMA_CONTIGUOUS |
39 | select HAVE_DYNAMIC_FTRACE | ||
38 | select HAVE_EFFICIENT_UNALIGNED_ACCESS | 40 | select HAVE_EFFICIENT_UNALIGNED_ACCESS |
41 | select HAVE_FTRACE_MCOUNT_RECORD | ||
42 | select HAVE_FUNCTION_TRACER | ||
43 | select HAVE_FUNCTION_GRAPH_TRACER | ||
39 | select HAVE_GENERIC_DMA_COHERENT | 44 | select HAVE_GENERIC_DMA_COHERENT |
40 | select HAVE_HW_BREAKPOINT if PERF_EVENTS | 45 | select HAVE_HW_BREAKPOINT if PERF_EVENTS |
41 | select HAVE_MEMBLOCK | 46 | select HAVE_MEMBLOCK |
@@ -43,6 +48,7 @@ config ARM64 | |||
43 | select HAVE_PERF_EVENTS | 48 | select HAVE_PERF_EVENTS |
44 | select HAVE_PERF_REGS | 49 | select HAVE_PERF_REGS |
45 | select HAVE_PERF_USER_STACK_DUMP | 50 | select HAVE_PERF_USER_STACK_DUMP |
51 | select HAVE_SYSCALL_TRACEPOINTS | ||
46 | select IRQ_DOMAIN | 52 | select IRQ_DOMAIN |
47 | select MODULES_USE_ELF_RELA | 53 | select MODULES_USE_ELF_RELA |
48 | select NO_BOOTMEM | 54 | select NO_BOOTMEM |
@@ -245,6 +251,9 @@ config ARCH_WANT_HUGE_PMD_SHARE | |||
245 | config HAVE_ARCH_TRANSPARENT_HUGEPAGE | 251 | config HAVE_ARCH_TRANSPARENT_HUGEPAGE |
246 | def_bool y | 252 | def_bool y |
247 | 253 | ||
254 | config ARCH_HAS_CACHE_LINE_SIZE | ||
255 | def_bool y | ||
256 | |||
248 | source "mm/Kconfig" | 257 | source "mm/Kconfig" |
249 | 258 | ||
250 | config XEN_DOM0 | 259 | config XEN_DOM0 |
@@ -359,5 +368,8 @@ source "arch/arm64/Kconfig.debug" | |||
359 | source "security/Kconfig" | 368 | source "security/Kconfig" |
360 | 369 | ||
361 | source "crypto/Kconfig" | 370 | source "crypto/Kconfig" |
371 | if CRYPTO | ||
372 | source "arch/arm64/crypto/Kconfig" | ||
373 | endif | ||
362 | 374 | ||
363 | source "lib/Kconfig" | 375 | source "lib/Kconfig" |
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 2fceb71ac3b7..8185a913c5ed 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS | |||
45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ | 45 | core-y += arch/arm64/kernel/ arch/arm64/mm/ |
46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ | 46 | core-$(CONFIG_KVM) += arch/arm64/kvm/ |
47 | core-$(CONFIG_XEN) += arch/arm64/xen/ | 47 | core-$(CONFIG_XEN) += arch/arm64/xen/ |
48 | core-$(CONFIG_CRYPTO) += arch/arm64/crypto/ | ||
48 | libs-y := arch/arm64/lib/ $(libs-y) | 49 | libs-y := arch/arm64/lib/ $(libs-y) |
49 | libs-y += $(LIBGCC) | 50 | libs-y += $(LIBGCC) |
50 | 51 | ||
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 7959dd0ca5d5..157e1d8d9a47 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1,11 +1,11 @@ | |||
1 | # CONFIG_LOCALVERSION_AUTO is not set | 1 | # CONFIG_LOCALVERSION_AUTO is not set |
2 | # CONFIG_SWAP is not set | ||
3 | CONFIG_SYSVIPC=y | 2 | CONFIG_SYSVIPC=y |
4 | CONFIG_POSIX_MQUEUE=y | 3 | CONFIG_POSIX_MQUEUE=y |
4 | CONFIG_AUDIT=y | ||
5 | CONFIG_NO_HZ_IDLE=y | ||
6 | CONFIG_HIGH_RES_TIMERS=y | ||
5 | CONFIG_BSD_PROCESS_ACCT=y | 7 | CONFIG_BSD_PROCESS_ACCT=y |
6 | CONFIG_BSD_PROCESS_ACCT_V3=y | 8 | CONFIG_BSD_PROCESS_ACCT_V3=y |
7 | CONFIG_NO_HZ=y | ||
8 | CONFIG_HIGH_RES_TIMERS=y | ||
9 | CONFIG_IKCONFIG=y | 9 | CONFIG_IKCONFIG=y |
10 | CONFIG_IKCONFIG_PROC=y | 10 | CONFIG_IKCONFIG_PROC=y |
11 | CONFIG_LOG_BUF_SHIFT=14 | 11 | CONFIG_LOG_BUF_SHIFT=14 |
@@ -27,6 +27,7 @@ CONFIG_ARCH_VEXPRESS=y | |||
27 | CONFIG_ARCH_XGENE=y | 27 | CONFIG_ARCH_XGENE=y |
28 | CONFIG_SMP=y | 28 | CONFIG_SMP=y |
29 | CONFIG_PREEMPT=y | 29 | CONFIG_PREEMPT=y |
30 | CONFIG_TRANSPARENT_HUGEPAGE=y | ||
30 | CONFIG_CMA=y | 31 | CONFIG_CMA=y |
31 | CONFIG_CMDLINE="console=ttyAMA0" | 32 | CONFIG_CMDLINE="console=ttyAMA0" |
32 | # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set | 33 | # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set |
@@ -44,7 +45,7 @@ CONFIG_IP_PNP_BOOTP=y | |||
44 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" | 45 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" |
45 | CONFIG_DEVTMPFS=y | 46 | CONFIG_DEVTMPFS=y |
46 | CONFIG_DMA_CMA=y | 47 | CONFIG_DMA_CMA=y |
47 | CONFIG_SCSI=y | 48 | CONFIG_VIRTIO_BLK=y |
48 | # CONFIG_SCSI_PROC_FS is not set | 49 | # CONFIG_SCSI_PROC_FS is not set |
49 | CONFIG_BLK_DEV_SD=y | 50 | CONFIG_BLK_DEV_SD=y |
50 | # CONFIG_SCSI_LOWLEVEL is not set | 51 | # CONFIG_SCSI_LOWLEVEL is not set |
@@ -56,20 +57,18 @@ CONFIG_SMC91X=y | |||
56 | CONFIG_SMSC911X=y | 57 | CONFIG_SMSC911X=y |
57 | # CONFIG_WLAN is not set | 58 | # CONFIG_WLAN is not set |
58 | CONFIG_INPUT_EVDEV=y | 59 | CONFIG_INPUT_EVDEV=y |
59 | # CONFIG_SERIO_I8042 is not set | ||
60 | # CONFIG_SERIO_SERPORT is not set | 60 | # CONFIG_SERIO_SERPORT is not set |
61 | CONFIG_LEGACY_PTY_COUNT=16 | 61 | CONFIG_LEGACY_PTY_COUNT=16 |
62 | CONFIG_SERIAL_8250=y | 62 | CONFIG_SERIAL_8250=y |
63 | CONFIG_SERIAL_8250_CONSOLE=y | 63 | CONFIG_SERIAL_8250_CONSOLE=y |
64 | CONFIG_SERIAL_OF_PLATFORM=y | ||
65 | CONFIG_SERIAL_AMBA_PL011=y | 64 | CONFIG_SERIAL_AMBA_PL011=y |
66 | CONFIG_SERIAL_AMBA_PL011_CONSOLE=y | 65 | CONFIG_SERIAL_AMBA_PL011_CONSOLE=y |
66 | CONFIG_SERIAL_OF_PLATFORM=y | ||
67 | # CONFIG_HW_RANDOM is not set | 67 | # CONFIG_HW_RANDOM is not set |
68 | # CONFIG_HWMON is not set | 68 | # CONFIG_HWMON is not set |
69 | CONFIG_REGULATOR=y | 69 | CONFIG_REGULATOR=y |
70 | CONFIG_REGULATOR_FIXED_VOLTAGE=y | 70 | CONFIG_REGULATOR_FIXED_VOLTAGE=y |
71 | CONFIG_FB=y | 71 | CONFIG_FB=y |
72 | # CONFIG_VGA_CONSOLE is not set | ||
73 | CONFIG_FRAMEBUFFER_CONSOLE=y | 72 | CONFIG_FRAMEBUFFER_CONSOLE=y |
74 | CONFIG_LOGO=y | 73 | CONFIG_LOGO=y |
75 | # CONFIG_LOGO_LINUX_MONO is not set | 74 | # CONFIG_LOGO_LINUX_MONO is not set |
@@ -79,27 +78,38 @@ CONFIG_USB_ISP1760_HCD=y | |||
79 | CONFIG_USB_STORAGE=y | 78 | CONFIG_USB_STORAGE=y |
80 | CONFIG_MMC=y | 79 | CONFIG_MMC=y |
81 | CONFIG_MMC_ARMMMCI=y | 80 | CONFIG_MMC_ARMMMCI=y |
81 | CONFIG_VIRTIO_MMIO=y | ||
82 | # CONFIG_IOMMU_SUPPORT is not set | 82 | # CONFIG_IOMMU_SUPPORT is not set |
83 | CONFIG_EXT2_FS=y | 83 | CONFIG_EXT2_FS=y |
84 | CONFIG_EXT3_FS=y | 84 | CONFIG_EXT3_FS=y |
85 | CONFIG_EXT4_FS=y | ||
86 | # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set | 85 | # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set |
87 | # CONFIG_EXT3_FS_XATTR is not set | 86 | # CONFIG_EXT3_FS_XATTR is not set |
87 | CONFIG_EXT4_FS=y | ||
88 | CONFIG_FUSE_FS=y | 88 | CONFIG_FUSE_FS=y |
89 | CONFIG_CUSE=y | 89 | CONFIG_CUSE=y |
90 | CONFIG_VFAT_FS=y | 90 | CONFIG_VFAT_FS=y |
91 | CONFIG_TMPFS=y | 91 | CONFIG_TMPFS=y |
92 | CONFIG_HUGETLBFS=y | ||
92 | # CONFIG_MISC_FILESYSTEMS is not set | 93 | # CONFIG_MISC_FILESYSTEMS is not set |
93 | CONFIG_NFS_FS=y | 94 | CONFIG_NFS_FS=y |
94 | CONFIG_ROOT_NFS=y | 95 | CONFIG_ROOT_NFS=y |
95 | CONFIG_NLS_CODEPAGE_437=y | 96 | CONFIG_NLS_CODEPAGE_437=y |
96 | CONFIG_NLS_ISO8859_1=y | 97 | CONFIG_NLS_ISO8859_1=y |
97 | CONFIG_MAGIC_SYSRQ=y | 98 | CONFIG_VIRTUALIZATION=y |
99 | CONFIG_KVM=y | ||
100 | CONFIG_DEBUG_INFO=y | ||
98 | CONFIG_DEBUG_FS=y | 101 | CONFIG_DEBUG_FS=y |
102 | CONFIG_MAGIC_SYSRQ=y | ||
99 | CONFIG_DEBUG_KERNEL=y | 103 | CONFIG_DEBUG_KERNEL=y |
104 | CONFIG_LOCKUP_DETECTOR=y | ||
100 | # CONFIG_SCHED_DEBUG is not set | 105 | # CONFIG_SCHED_DEBUG is not set |
101 | CONFIG_DEBUG_INFO=y | ||
102 | # CONFIG_FTRACE is not set | 106 | # CONFIG_FTRACE is not set |
103 | CONFIG_ATOMIC64_SELFTEST=y | 107 | CONFIG_CRYPTO_ANSI_CPRNG=y |
104 | CONFIG_VIRTIO_MMIO=y | 108 | CONFIG_ARM64_CRYPTO=y |
105 | CONFIG_VIRTIO_BLK=y | 109 | CONFIG_CRYPTO_SHA1_ARM64_CE=y |
110 | CONFIG_CRYPTO_SHA2_ARM64_CE=y | ||
111 | CONFIG_CRYPTO_GHASH_ARM64_CE=y | ||
112 | CONFIG_CRYPTO_AES_ARM64_CE=y | ||
113 | CONFIG_CRYPTO_AES_ARM64_CE_CCM=y | ||
114 | CONFIG_CRYPTO_AES_ARM64_CE_BLK=y | ||
115 | CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y | ||
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
new file mode 100644
index 000000000000..5562652c5316
--- /dev/null
+++ b/arch/arm64/crypto/Kconfig
@@ -0,0 +1,53 @@ | |||
1 | |||
2 | menuconfig ARM64_CRYPTO | ||
3 | bool "ARM64 Accelerated Cryptographic Algorithms" | ||
4 | depends on ARM64 | ||
5 | help | ||
6 | Say Y here to choose from a selection of cryptographic algorithms | ||
7 | implemented using ARM64 specific CPU features or instructions. | ||
8 | |||
9 | if ARM64_CRYPTO | ||
10 | |||
11 | config CRYPTO_SHA1_ARM64_CE | ||
12 | tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" | ||
13 | depends on ARM64 && KERNEL_MODE_NEON | ||
14 | select CRYPTO_HASH | ||
15 | |||
16 | config CRYPTO_SHA2_ARM64_CE | ||
17 | tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" | ||
18 | depends on ARM64 && KERNEL_MODE_NEON | ||
19 | select CRYPTO_HASH | ||
20 | |||
21 | config CRYPTO_GHASH_ARM64_CE | ||
22 | tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" | ||
23 | depends on ARM64 && KERNEL_MODE_NEON | ||
24 | select CRYPTO_HASH | ||
25 | |||
26 | config CRYPTO_AES_ARM64_CE | ||
27 | tristate "AES core cipher using ARMv8 Crypto Extensions" | ||
28 | depends on ARM64 && KERNEL_MODE_NEON | ||
29 | select CRYPTO_ALGAPI | ||
30 | select CRYPTO_AES | ||
31 | |||
32 | config CRYPTO_AES_ARM64_CE_CCM | ||
33 | tristate "AES in CCM mode using ARMv8 Crypto Extensions" | ||
34 | depends on ARM64 && KERNEL_MODE_NEON | ||
35 | select CRYPTO_ALGAPI | ||
36 | select CRYPTO_AES | ||
37 | select CRYPTO_AEAD | ||
38 | |||
39 | config CRYPTO_AES_ARM64_CE_BLK | ||
40 | tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" | ||
41 | depends on ARM64 && KERNEL_MODE_NEON | ||
42 | select CRYPTO_BLKCIPHER | ||
43 | select CRYPTO_AES | ||
44 | select CRYPTO_ABLK_HELPER | ||
45 | |||
46 | config CRYPTO_AES_ARM64_NEON_BLK | ||
47 | tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" | ||
48 | depends on ARM64 && KERNEL_MODE_NEON | ||
49 | select CRYPTO_BLKCIPHER | ||
50 | select CRYPTO_AES | ||
51 | select CRYPTO_ABLK_HELPER | ||
52 | |||
53 | endif | ||
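These options only register additional implementations with the crypto API; consumers keep requesting the generic algorithm names and transparently pick up the accelerated drivers when the hardware supports them. A hedged caller-side sketch (not part of the patch):

	#include <linux/crypto.h>

	/* Usage sketch: requesting "ccm(aes)" resolves to the "ccm-aes-ce"
	 * driver (priority 300) once the module built from
	 * CRYPTO_AES_ARM64_CE_CCM is loaded and the CPU advertises the AES
	 * instructions. Error handling omitted. */
	static struct crypto_aead *get_ccm(void)
	{
		return crypto_alloc_aead("ccm(aes)", 0, 0);
	}
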
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
new file mode 100644
index 000000000000..2070a56ecc46
--- /dev/null
+++ b/arch/arm64/crypto/Makefile
@@ -0,0 +1,38 @@ | |||
1 | # | ||
2 | # linux/arch/arm64/crypto/Makefile | ||
3 | # | ||
4 | # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | # | ||
6 | # This program is free software; you can redistribute it and/or modify | ||
7 | # it under the terms of the GNU General Public License version 2 as | ||
8 | # published by the Free Software Foundation. | ||
9 | # | ||
10 | |||
11 | obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o | ||
12 | sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o | ||
13 | |||
14 | obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o | ||
15 | sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o | ||
16 | |||
17 | obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o | ||
18 | ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o | ||
19 | |||
20 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o | ||
21 | CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto | ||
22 | |||
23 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o | ||
24 | aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o | ||
25 | |||
26 | obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o | ||
27 | aes-ce-blk-y := aes-glue-ce.o aes-ce.o | ||
28 | |||
29 | obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o | ||
30 | aes-neon-blk-y := aes-glue-neon.o aes-neon.o | ||
31 | |||
32 | AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE | ||
33 | AFLAGS_aes-neon.o := -DINTERLEAVE=4 | ||
34 | |||
35 | CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS | ||
36 | |||
37 | $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE | ||
38 | $(call if_changed_dep,cc_o_c) | ||
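The pattern rule at the end builds aes-glue.c twice from one source file: aes-glue-ce.o is compiled with -DUSE_V8_CRYPTO_EXTENSIONS and binds to the ce_aes_* entry points at priority 300, while aes-glue-neon.o is compiled without the define and falls back to the neon_aes_* routines at priority 200 (see aes-glue.c further down). In effect:

	/* Effect of the dual build of aes-glue.c (both objects from one file): */
	#ifdef USE_V8_CRYPTO_EXTENSIONS		/* set only for aes-glue-ce.o */
	#define aes_ecb_encrypt ce_aes_ecb_encrypt	/* Crypto Extensions backend */
	#else
	#define aes_ecb_encrypt neon_aes_ecb_encrypt	/* generic NEON backend */
	#endif
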
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
new file mode 100644
index 000000000000..432e4841cd81
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -0,0 +1,222 @@ | |||
1 | /* | ||
2 | * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | |||
13 | .text | ||
14 | .arch armv8-a+crypto | ||
15 | |||
16 | /* | ||
17 | * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
18 | * u32 *macp, u8 const rk[], u32 rounds); | ||
19 | */ | ||
20 | ENTRY(ce_aes_ccm_auth_data) | ||
21 | ldr w8, [x3] /* leftover from prev round? */ | ||
22 | ld1 {v0.2d}, [x0] /* load mac */ | ||
23 | cbz w8, 1f | ||
24 | sub w8, w8, #16 | ||
25 | eor v1.16b, v1.16b, v1.16b | ||
26 | 0: ldrb w7, [x1], #1 /* get 1 byte of input */ | ||
27 | subs w2, w2, #1 | ||
28 | add w8, w8, #1 | ||
29 | ins v1.b[0], w7 | ||
30 | ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ | ||
31 | beq 8f /* out of input? */ | ||
32 | cbnz w8, 0b | ||
33 | eor v0.16b, v0.16b, v1.16b | ||
34 | 1: ld1 {v3.2d}, [x4] /* load first round key */ | ||
35 | prfm pldl1strm, [x1] | ||
36 | cmp w5, #12 /* which key size? */ | ||
37 | add x6, x4, #16 | ||
38 | sub w7, w5, #2 /* modified # of rounds */ | ||
39 | bmi 2f | ||
40 | bne 5f | ||
41 | mov v5.16b, v3.16b | ||
42 | b 4f | ||
43 | 2: mov v4.16b, v3.16b | ||
44 | ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ | ||
45 | 3: aese v0.16b, v4.16b | ||
46 | aesmc v0.16b, v0.16b | ||
47 | 4: ld1 {v3.2d}, [x6], #16 /* load next round key */ | ||
48 | aese v0.16b, v5.16b | ||
49 | aesmc v0.16b, v0.16b | ||
50 | 5: ld1 {v4.2d}, [x6], #16 /* load next round key */ | ||
51 | subs w7, w7, #3 | ||
52 | aese v0.16b, v3.16b | ||
53 | aesmc v0.16b, v0.16b | ||
54 | ld1 {v5.2d}, [x6], #16 /* load next round key */ | ||
55 | bpl 3b | ||
56 | aese v0.16b, v4.16b | ||
57 | subs w2, w2, #16 /* last data? */ | ||
58 | eor v0.16b, v0.16b, v5.16b /* final round */ | ||
59 | bmi 6f | ||
60 | ld1 {v1.16b}, [x1], #16 /* load next input block */ | ||
61 | eor v0.16b, v0.16b, v1.16b /* xor with mac */ | ||
62 | bne 1b | ||
63 | 6: st1 {v0.2d}, [x0] /* store mac */ | ||
64 | beq 10f | ||
65 | adds w2, w2, #16 | ||
66 | beq 10f | ||
67 | mov w8, w2 | ||
68 | 7: ldrb w7, [x1], #1 | ||
69 | umov w6, v0.b[0] | ||
70 | eor w6, w6, w7 | ||
71 | strb w6, [x0], #1 | ||
72 | subs w2, w2, #1 | ||
73 | beq 10f | ||
74 | ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ | ||
75 | b 7b | ||
76 | 8: mov w7, w8 | ||
77 | add w8, w8, #16 | ||
78 | 9: ext v1.16b, v1.16b, v1.16b, #1 | ||
79 | adds w7, w7, #1 | ||
80 | bne 9b | ||
81 | eor v0.16b, v0.16b, v1.16b | ||
82 | st1 {v0.2d}, [x0] | ||
83 | 10: str w8, [x3] | ||
84 | ret | ||
85 | ENDPROC(ce_aes_ccm_auth_data) | ||
86 | |||
87 | /* | ||
88 | * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], | ||
89 | * u32 rounds); | ||
90 | */ | ||
91 | ENTRY(ce_aes_ccm_final) | ||
92 | ld1 {v3.2d}, [x2], #16 /* load first round key */ | ||
93 | ld1 {v0.2d}, [x0] /* load mac */ | ||
94 | cmp w3, #12 /* which key size? */ | ||
95 | sub w3, w3, #2 /* modified # of rounds */ | ||
96 | ld1 {v1.2d}, [x1] /* load 1st ctriv */ | ||
97 | bmi 0f | ||
98 | bne 3f | ||
99 | mov v5.16b, v3.16b | ||
100 | b 2f | ||
101 | 0: mov v4.16b, v3.16b | ||
102 | 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ | ||
103 | aese v0.16b, v4.16b | ||
104 | aese v1.16b, v4.16b | ||
105 | aesmc v0.16b, v0.16b | ||
106 | aesmc v1.16b, v1.16b | ||
107 | 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ | ||
108 | aese v0.16b, v5.16b | ||
109 | aese v1.16b, v5.16b | ||
110 | aesmc v0.16b, v0.16b | ||
111 | aesmc v1.16b, v1.16b | ||
112 | 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ | ||
113 | subs w3, w3, #3 | ||
114 | aese v0.16b, v3.16b | ||
115 | aese v1.16b, v3.16b | ||
116 | aesmc v0.16b, v0.16b | ||
117 | aesmc v1.16b, v1.16b | ||
118 | bpl 1b | ||
119 | aese v0.16b, v4.16b | ||
120 | aese v1.16b, v4.16b | ||
121 | /* final round key cancels out */ | ||
122 | eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ | ||
123 | st1 {v0.2d}, [x0] /* store result */ | ||
124 | ret | ||
125 | ENDPROC(ce_aes_ccm_final) | ||
126 | |||
127 | .macro aes_ccm_do_crypt,enc | ||
128 | ldr x8, [x6, #8] /* load lower ctr */ | ||
129 | ld1 {v0.2d}, [x5] /* load mac */ | ||
130 | rev x8, x8 /* keep swabbed ctr in reg */ | ||
131 | 0: /* outer loop */ | ||
132 | ld1 {v1.1d}, [x6] /* load upper ctr */ | ||
133 | prfm pldl1strm, [x1] | ||
134 | add x8, x8, #1 | ||
135 | rev x9, x8 | ||
136 | cmp w4, #12 /* which key size? */ | ||
137 | sub w7, w4, #2 /* get modified # of rounds */ | ||
138 | ins v1.d[1], x9 /* no carry in lower ctr */ | ||
139 | ld1 {v3.2d}, [x3] /* load first round key */ | ||
140 | add x10, x3, #16 | ||
141 | bmi 1f | ||
142 | bne 4f | ||
143 | mov v5.16b, v3.16b | ||
144 | b 3f | ||
145 | 1: mov v4.16b, v3.16b | ||
146 | ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ | ||
147 | 2: /* inner loop: 3 rounds, 2x interleaved */ | ||
148 | aese v0.16b, v4.16b | ||
149 | aese v1.16b, v4.16b | ||
150 | aesmc v0.16b, v0.16b | ||
151 | aesmc v1.16b, v1.16b | ||
152 | 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ | ||
153 | aese v0.16b, v5.16b | ||
154 | aese v1.16b, v5.16b | ||
155 | aesmc v0.16b, v0.16b | ||
156 | aesmc v1.16b, v1.16b | ||
157 | 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ | ||
158 | subs w7, w7, #3 | ||
159 | aese v0.16b, v3.16b | ||
160 | aese v1.16b, v3.16b | ||
161 | aesmc v0.16b, v0.16b | ||
162 | aesmc v1.16b, v1.16b | ||
163 | ld1 {v5.2d}, [x10], #16 /* load next round key */ | ||
164 | bpl 2b | ||
165 | aese v0.16b, v4.16b | ||
166 | aese v1.16b, v4.16b | ||
167 | subs w2, w2, #16 | ||
168 | bmi 6f /* partial block? */ | ||
169 | ld1 {v2.16b}, [x1], #16 /* load next input block */ | ||
170 | .if \enc == 1 | ||
171 | eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ | ||
172 | eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ | ||
173 | .else | ||
174 | eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ | ||
175 | eor v1.16b, v2.16b, v5.16b /* final round enc */ | ||
176 | .endif | ||
177 | eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ | ||
178 | st1 {v1.16b}, [x0], #16 /* write output block */ | ||
179 | bne 0b | ||
180 | rev x8, x8 | ||
181 | st1 {v0.2d}, [x5] /* store mac */ | ||
182 | str x8, [x6, #8] /* store lsb end of ctr (BE) */ | ||
183 | 5: ret | ||
184 | |||
185 | 6: eor v0.16b, v0.16b, v5.16b /* final round mac */ | ||
186 | eor v1.16b, v1.16b, v5.16b /* final round enc */ | ||
187 | st1 {v0.2d}, [x5] /* store mac */ | ||
188 | add w2, w2, #16 /* process partial tail block */ | ||
189 | 7: ldrb w9, [x1], #1 /* get 1 byte of input */ | ||
190 | umov w6, v1.b[0] /* get top crypted ctr byte */ | ||
191 | umov w7, v0.b[0] /* get top mac byte */ | ||
192 | .if \enc == 1 | ||
193 | eor w7, w7, w9 | ||
194 | eor w9, w9, w6 | ||
195 | .else | ||
196 | eor w9, w9, w6 | ||
197 | eor w7, w7, w9 | ||
198 | .endif | ||
199 | strb w9, [x0], #1 /* store out byte */ | ||
200 | strb w7, [x5], #1 /* store mac byte */ | ||
201 | subs w2, w2, #1 | ||
202 | beq 5b | ||
203 | ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ | ||
204 | ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ | ||
205 | b 7b | ||
206 | .endm | ||
207 | |||
208 | /* | ||
209 | * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
210 | * u8 const rk[], u32 rounds, u8 mac[], | ||
211 | * u8 ctr[]); | ||
212 | * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
213 | * u8 const rk[], u32 rounds, u8 mac[], | ||
214 | * u8 ctr[]); | ||
215 | */ | ||
216 | ENTRY(ce_aes_ccm_encrypt) | ||
217 | aes_ccm_do_crypt 1 | ||
218 | ENDPROC(ce_aes_ccm_encrypt) | ||
219 | |||
220 | ENTRY(ce_aes_ccm_decrypt) | ||
221 | aes_ccm_do_crypt 0 | ||
222 | ENDPROC(ce_aes_ccm_decrypt) | ||
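One detail of aes_ccm_do_crypt worth spelling out is the counter handling: the low 64 bits of the big-endian CCM counter block are kept CPU-endian in a register ("rev x8") and bumped once per block ("add x8, x8, #1"), while the upper 8 bytes are reloaded unchanged every iteration; since CCM's length field L is at most 8 bytes, only that lower half ever changes. A hedged C model of the same step (not part of the patch):

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/* Sketch of the per-block counter stepping done in assembly above. */
	static void ccm_step_ctr_model(u8 ctr[16])
	{
		u64 lo = get_unaligned_be64(ctr + 8);

		put_unaligned_be64(lo + 1, ctr + 8);
	}
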
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
new file mode 100644
index 000000000000..9e6cdde9b43d
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -0,0 +1,297 @@ | |||
1 | /* | ||
2 | * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/aes.h> | ||
14 | #include <crypto/algapi.h> | ||
15 | #include <crypto/scatterwalk.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
20 | { | ||
21 | /* | ||
22 | * # of rounds specified by AES: | ||
23 | * 128 bit key 10 rounds | ||
24 | * 192 bit key 12 rounds | ||
25 | * 256 bit key 14 rounds | ||
26 | * => n byte key => 6 + (n/4) rounds | ||
27 | */ | ||
28 | return 6 + ctx->key_length / 4; | ||
29 | } | ||
30 | |||
31 | asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, | ||
32 | u32 *macp, u32 const rk[], u32 rounds); | ||
33 | |||
34 | asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, | ||
35 | u32 const rk[], u32 rounds, u8 mac[], | ||
36 | u8 ctr[]); | ||
37 | |||
38 | asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, | ||
39 | u32 const rk[], u32 rounds, u8 mac[], | ||
40 | u8 ctr[]); | ||
41 | |||
42 | asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], | ||
43 | u32 rounds); | ||
44 | |||
45 | static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, | ||
46 | unsigned int key_len) | ||
47 | { | ||
48 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); | ||
49 | int ret; | ||
50 | |||
51 | ret = crypto_aes_expand_key(ctx, in_key, key_len); | ||
52 | if (!ret) | ||
53 | return 0; | ||
54 | |||
55 | tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
56 | return -EINVAL; | ||
57 | } | ||
58 | |||
59 | static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) | ||
60 | { | ||
61 | if ((authsize & 1) || authsize < 4) | ||
62 | return -EINVAL; | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) | ||
67 | { | ||
68 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
69 | __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; | ||
70 | u32 l = req->iv[0] + 1; | ||
71 | |||
72 | /* verify that CCM dimension 'L' is set correctly in the IV */ | ||
73 | if (l < 2 || l > 8) | ||
74 | return -EINVAL; | ||
75 | |||
76 | /* verify that msglen can in fact be represented in L bytes */ | ||
77 | if (l < 4 && msglen >> (8 * l)) | ||
78 | return -EOVERFLOW; | ||
79 | |||
80 | /* | ||
81 | * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi | ||
82 | * uses a u32 type to represent msglen so the top 4 bytes are always 0. | ||
83 | */ | ||
84 | n[0] = 0; | ||
85 | n[1] = cpu_to_be32(msglen); | ||
86 | |||
87 | memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); | ||
88 | |||
89 | /* | ||
90 | * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) | ||
91 | * - bits 0..2 : max # of bytes required to represent msglen, minus 1 | ||
92 | * (already set by caller) | ||
93 | * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) | ||
94 | * - bit 6 : indicates presence of authenticate-only data | ||
95 | */ | ||
96 | maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; | ||
97 | if (req->assoclen) | ||
98 | maciv[0] |= 0x40; | ||
99 | |||
100 | memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) | ||
105 | { | ||
106 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
107 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
108 | struct __packed { __be16 l; __be32 h; u16 len; } ltag; | ||
109 | struct scatter_walk walk; | ||
110 | u32 len = req->assoclen; | ||
111 | u32 macp = 0; | ||
112 | |||
113 | /* prepend the AAD with a length tag */ | ||
114 | if (len < 0xff00) { | ||
115 | ltag.l = cpu_to_be16(len); | ||
116 | ltag.len = 2; | ||
117 | } else { | ||
118 | ltag.l = cpu_to_be16(0xfffe); | ||
119 | put_unaligned_be32(len, &ltag.h); | ||
120 | ltag.len = 6; | ||
121 | } | ||
122 | |||
123 | ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc, | ||
124 | num_rounds(ctx)); | ||
125 | scatterwalk_start(&walk, req->assoc); | ||
126 | |||
127 | do { | ||
128 | u32 n = scatterwalk_clamp(&walk, len); | ||
129 | u8 *p; | ||
130 | |||
131 | if (!n) { | ||
132 | scatterwalk_start(&walk, sg_next(walk.sg)); | ||
133 | n = scatterwalk_clamp(&walk, len); | ||
134 | } | ||
135 | p = scatterwalk_map(&walk); | ||
136 | ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, | ||
137 | num_rounds(ctx)); | ||
138 | len -= n; | ||
139 | |||
140 | scatterwalk_unmap(p); | ||
141 | scatterwalk_advance(&walk, n); | ||
142 | scatterwalk_done(&walk, 0, len); | ||
143 | } while (len); | ||
144 | } | ||
145 | |||
146 | static int ccm_encrypt(struct aead_request *req) | ||
147 | { | ||
148 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
149 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
150 | struct blkcipher_desc desc = { .info = req->iv }; | ||
151 | struct blkcipher_walk walk; | ||
152 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
153 | u8 buf[AES_BLOCK_SIZE]; | ||
154 | u32 len = req->cryptlen; | ||
155 | int err; | ||
156 | |||
157 | err = ccm_init_mac(req, mac, len); | ||
158 | if (err) | ||
159 | return err; | ||
160 | |||
161 | kernel_neon_begin_partial(6); | ||
162 | |||
163 | if (req->assoclen) | ||
164 | ccm_calculate_auth_mac(req, mac); | ||
165 | |||
166 | /* preserve the original iv for the final round */ | ||
167 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
168 | |||
169 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
170 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
171 | AES_BLOCK_SIZE); | ||
172 | |||
173 | while (walk.nbytes) { | ||
174 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
175 | |||
176 | if (walk.nbytes == len) | ||
177 | tail = 0; | ||
178 | |||
179 | ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
180 | walk.nbytes - tail, ctx->key_enc, | ||
181 | num_rounds(ctx), mac, walk.iv); | ||
182 | |||
183 | len -= walk.nbytes - tail; | ||
184 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
185 | } | ||
186 | if (!err) | ||
187 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
188 | |||
189 | kernel_neon_end(); | ||
190 | |||
191 | if (err) | ||
192 | return err; | ||
193 | |||
194 | /* copy authtag to end of dst */ | ||
195 | scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, | ||
196 | crypto_aead_authsize(aead), 1); | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static int ccm_decrypt(struct aead_request *req) | ||
202 | { | ||
203 | struct crypto_aead *aead = crypto_aead_reqtfm(req); | ||
204 | struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); | ||
205 | unsigned int authsize = crypto_aead_authsize(aead); | ||
206 | struct blkcipher_desc desc = { .info = req->iv }; | ||
207 | struct blkcipher_walk walk; | ||
208 | u8 __aligned(8) mac[AES_BLOCK_SIZE]; | ||
209 | u8 buf[AES_BLOCK_SIZE]; | ||
210 | u32 len = req->cryptlen - authsize; | ||
211 | int err; | ||
212 | |||
213 | err = ccm_init_mac(req, mac, len); | ||
214 | if (err) | ||
215 | return err; | ||
216 | |||
217 | kernel_neon_begin_partial(6); | ||
218 | |||
219 | if (req->assoclen) | ||
220 | ccm_calculate_auth_mac(req, mac); | ||
221 | |||
222 | /* preserve the original iv for the final round */ | ||
223 | memcpy(buf, req->iv, AES_BLOCK_SIZE); | ||
224 | |||
225 | blkcipher_walk_init(&walk, req->dst, req->src, len); | ||
226 | err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, | ||
227 | AES_BLOCK_SIZE); | ||
228 | |||
229 | while (walk.nbytes) { | ||
230 | u32 tail = walk.nbytes % AES_BLOCK_SIZE; | ||
231 | |||
232 | if (walk.nbytes == len) | ||
233 | tail = 0; | ||
234 | |||
235 | ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
236 | walk.nbytes - tail, ctx->key_enc, | ||
237 | num_rounds(ctx), mac, walk.iv); | ||
238 | |||
239 | len -= walk.nbytes - tail; | ||
240 | err = blkcipher_walk_done(&desc, &walk, tail); | ||
241 | } | ||
242 | if (!err) | ||
243 | ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); | ||
244 | |||
245 | kernel_neon_end(); | ||
246 | |||
247 | if (err) | ||
248 | return err; | ||
249 | |||
250 | /* compare calculated auth tag with the stored one */ | ||
251 | scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, | ||
252 | authsize, 0); | ||
253 | |||
254 | if (memcmp(mac, buf, authsize)) | ||
255 | return -EBADMSG; | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static struct crypto_alg ccm_aes_alg = { | ||
260 | .cra_name = "ccm(aes)", | ||
261 | .cra_driver_name = "ccm-aes-ce", | ||
262 | .cra_priority = 300, | ||
263 | .cra_flags = CRYPTO_ALG_TYPE_AEAD, | ||
264 | .cra_blocksize = 1, | ||
265 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
266 | .cra_alignmask = 7, | ||
267 | .cra_type = &crypto_aead_type, | ||
268 | .cra_module = THIS_MODULE, | ||
269 | .cra_aead = { | ||
270 | .ivsize = AES_BLOCK_SIZE, | ||
271 | .maxauthsize = AES_BLOCK_SIZE, | ||
272 | .setkey = ccm_setkey, | ||
273 | .setauthsize = ccm_setauthsize, | ||
274 | .encrypt = ccm_encrypt, | ||
275 | .decrypt = ccm_decrypt, | ||
276 | } | ||
277 | }; | ||
278 | |||
279 | static int __init aes_mod_init(void) | ||
280 | { | ||
281 | if (!(elf_hwcap & HWCAP_AES)) | ||
282 | return -ENODEV; | ||
283 | return crypto_register_alg(&ccm_aes_alg); | ||
284 | } | ||
285 | |||
286 | static void __exit aes_mod_exit(void) | ||
287 | { | ||
288 | crypto_unregister_alg(&ccm_aes_alg); | ||
289 | } | ||
290 | |||
291 | module_init(aes_mod_init); | ||
292 | module_exit(aes_mod_exit); | ||
293 | |||
294 | MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); | ||
295 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
296 | MODULE_LICENSE("GPL v2"); | ||
297 | MODULE_ALIAS("ccm(aes)"); | ||
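Two bits of arithmetic in this glue code are easy to misread, so here they are worked out (informational only, nothing below is part of the patch):

	/* num_rounds():  6 + key_length / 4
	 *   AES-128: 6 + 16/4 = 10   AES-192: 6 + 24/4 = 12   AES-256: 6 + 32/4 = 14
	 *
	 * B_0 flags byte in ccm_init_mac():  (authsize - 2) << 2
	 *   e.g. authsize = 12  ->  (12 - 2) << 2 = 40 = ((12 - 2) / 2) << 3,
	 *   i.e. the standard CCM encoding of (t - 2)/2 in bits 3..5; shifting
	 *   by 2 instead of "divide by 2 then shift by 3" works because
	 *   ccm_setauthsize() rejects odd tag sizes. */
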
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c
new file mode 100644
index 000000000000..2075e1acae6b
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-cipher.c
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <crypto/aes.h> | ||
13 | #include <linux/cpufeature.h> | ||
14 | #include <linux/crypto.h> | ||
15 | #include <linux/module.h> | ||
16 | |||
17 | MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); | ||
18 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
19 | MODULE_LICENSE("GPL v2"); | ||
20 | |||
21 | struct aes_block { | ||
22 | u8 b[AES_BLOCK_SIZE]; | ||
23 | }; | ||
24 | |||
25 | static int num_rounds(struct crypto_aes_ctx *ctx) | ||
26 | { | ||
27 | /* | ||
28 | * # of rounds specified by AES: | ||
29 | * 128 bit key 10 rounds | ||
30 | * 192 bit key 12 rounds | ||
31 | * 256 bit key 14 rounds | ||
32 | * => n byte key => 6 + (n/4) rounds | ||
33 | */ | ||
34 | return 6 + ctx->key_length / 4; | ||
35 | } | ||
36 | |||
37 | static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
38 | { | ||
39 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
40 | struct aes_block *out = (struct aes_block *)dst; | ||
41 | struct aes_block const *in = (struct aes_block *)src; | ||
42 | void *dummy0; | ||
43 | int dummy1; | ||
44 | |||
45 | kernel_neon_begin_partial(4); | ||
46 | |||
47 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
48 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
49 | " cmp %w[rounds], #10 ;" | ||
50 | " bmi 0f ;" | ||
51 | " bne 3f ;" | ||
52 | " mov v3.16b, v1.16b ;" | ||
53 | " b 2f ;" | ||
54 | "0: mov v2.16b, v1.16b ;" | ||
55 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
56 | "1: aese v0.16b, v2.16b ;" | ||
57 | " aesmc v0.16b, v0.16b ;" | ||
58 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
59 | " aese v0.16b, v3.16b ;" | ||
60 | " aesmc v0.16b, v0.16b ;" | ||
61 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
62 | " subs %w[rounds], %w[rounds], #3 ;" | ||
63 | " aese v0.16b, v1.16b ;" | ||
64 | " aesmc v0.16b, v0.16b ;" | ||
65 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
66 | " bpl 1b ;" | ||
67 | " aese v0.16b, v2.16b ;" | ||
68 | " eor v0.16b, v0.16b, v3.16b ;" | ||
69 | " st1 {v0.16b}, %[out] ;" | ||
70 | |||
71 | : [out] "=Q"(*out), | ||
72 | [key] "=r"(dummy0), | ||
73 | [rounds] "=r"(dummy1) | ||
74 | : [in] "Q"(*in), | ||
75 | "1"(ctx->key_enc), | ||
76 | "2"(num_rounds(ctx) - 2) | ||
77 | : "cc"); | ||
78 | |||
79 | kernel_neon_end(); | ||
80 | } | ||
81 | |||
82 | static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) | ||
83 | { | ||
84 | struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); | ||
85 | struct aes_block *out = (struct aes_block *)dst; | ||
86 | struct aes_block const *in = (struct aes_block *)src; | ||
87 | void *dummy0; | ||
88 | int dummy1; | ||
89 | |||
90 | kernel_neon_begin_partial(4); | ||
91 | |||
92 | __asm__(" ld1 {v0.16b}, %[in] ;" | ||
93 | " ld1 {v1.2d}, [%[key]], #16 ;" | ||
94 | " cmp %w[rounds], #10 ;" | ||
95 | " bmi 0f ;" | ||
96 | " bne 3f ;" | ||
97 | " mov v3.16b, v1.16b ;" | ||
98 | " b 2f ;" | ||
99 | "0: mov v2.16b, v1.16b ;" | ||
100 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
101 | "1: aesd v0.16b, v2.16b ;" | ||
102 | " aesimc v0.16b, v0.16b ;" | ||
103 | "2: ld1 {v1.2d}, [%[key]], #16 ;" | ||
104 | " aesd v0.16b, v3.16b ;" | ||
105 | " aesimc v0.16b, v0.16b ;" | ||
106 | "3: ld1 {v2.2d}, [%[key]], #16 ;" | ||
107 | " subs %w[rounds], %w[rounds], #3 ;" | ||
108 | " aesd v0.16b, v1.16b ;" | ||
109 | " aesimc v0.16b, v0.16b ;" | ||
110 | " ld1 {v3.2d}, [%[key]], #16 ;" | ||
111 | " bpl 1b ;" | ||
112 | " aesd v0.16b, v2.16b ;" | ||
113 | " eor v0.16b, v0.16b, v3.16b ;" | ||
114 | " st1 {v0.16b}, %[out] ;" | ||
115 | |||
116 | : [out] "=Q"(*out), | ||
117 | [key] "=r"(dummy0), | ||
118 | [rounds] "=r"(dummy1) | ||
119 | : [in] "Q"(*in), | ||
120 | "1"(ctx->key_dec), | ||
121 | "2"(num_rounds(ctx) - 2) | ||
122 | : "cc"); | ||
123 | |||
124 | kernel_neon_end(); | ||
125 | } | ||
126 | |||
127 | static struct crypto_alg aes_alg = { | ||
128 | .cra_name = "aes", | ||
129 | .cra_driver_name = "aes-ce", | ||
130 | .cra_priority = 300, | ||
131 | .cra_flags = CRYPTO_ALG_TYPE_CIPHER, | ||
132 | .cra_blocksize = AES_BLOCK_SIZE, | ||
133 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
134 | .cra_module = THIS_MODULE, | ||
135 | .cra_cipher = { | ||
136 | .cia_min_keysize = AES_MIN_KEY_SIZE, | ||
137 | .cia_max_keysize = AES_MAX_KEY_SIZE, | ||
138 | .cia_setkey = crypto_aes_set_key, | ||
139 | .cia_encrypt = aes_cipher_encrypt, | ||
140 | .cia_decrypt = aes_cipher_decrypt | ||
141 | } | ||
142 | }; | ||
143 | |||
144 | static int __init aes_mod_init(void) | ||
145 | { | ||
146 | return crypto_register_alg(&aes_alg); | ||
147 | } | ||
148 | |||
149 | static void __exit aes_mod_exit(void) | ||
150 | { | ||
151 | crypto_unregister_alg(&aes_alg); | ||
152 | } | ||
153 | |||
154 | module_cpu_feature_match(AES, aes_mod_init); | ||
155 | module_exit(aes_mod_exit); | ||
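The asm blocks above declare dummy outputs ("=r"(dummy0), "=r"(dummy1)) tied back to the key pointer and round count ("1"(ctx->key_enc), "2"(num_rounds(ctx) - 2)) so the compiler knows those registers are consumed by the post-indexed loads and the "subs". The same contract is usually written today with "+r" read-write operands; a tiny self-contained illustration of that constraint style (AArch64 only, not from this patch):

	/* Minimal read-write operand example: x is both input and output of
	 * the asm, which is exactly what the tied dummy operands in
	 * aes_cipher_encrypt()/aes_cipher_decrypt() express. */
	static inline unsigned long incr_asm(unsigned long x)
	{
		asm("add %0, %0, #1" : "+r"(x));
		return x;
	}
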
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
new file mode 100644
index 000000000000..685a18f731eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,133 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with | ||
3 | * Crypto Extensions | ||
4 | * | ||
5 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | |||
14 | #define AES_ENTRY(func) ENTRY(ce_ ## func) | ||
15 | #define AES_ENDPROC(func) ENDPROC(ce_ ## func) | ||
16 | |||
17 | .arch armv8-a+crypto | ||
18 | |||
19 | /* preload all round keys */ | ||
20 | .macro load_round_keys, rounds, rk | ||
21 | cmp \rounds, #12 | ||
22 | blo 2222f /* 128 bits */ | ||
23 | beq 1111f /* 192 bits */ | ||
24 | ld1 {v17.16b-v18.16b}, [\rk], #32 | ||
25 | 1111: ld1 {v19.16b-v20.16b}, [\rk], #32 | ||
26 | 2222: ld1 {v21.16b-v24.16b}, [\rk], #64 | ||
27 | ld1 {v25.16b-v28.16b}, [\rk], #64 | ||
28 | ld1 {v29.16b-v31.16b}, [\rk] | ||
29 | .endm | ||
30 | |||
31 | /* prepare for encryption with key in rk[] */ | ||
32 | .macro enc_prepare, rounds, rk, ignore | ||
33 | load_round_keys \rounds, \rk | ||
34 | .endm | ||
35 | |||
36 | /* prepare for encryption (again) but with new key in rk[] */ | ||
37 | .macro enc_switch_key, rounds, rk, ignore | ||
38 | load_round_keys \rounds, \rk | ||
39 | .endm | ||
40 | |||
41 | /* prepare for decryption with key in rk[] */ | ||
42 | .macro dec_prepare, rounds, rk, ignore | ||
43 | load_round_keys \rounds, \rk | ||
44 | .endm | ||
45 | |||
46 | .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 | ||
47 | aes\de \i0\().16b, \k\().16b | ||
48 | .ifnb \i1 | ||
49 | aes\de \i1\().16b, \k\().16b | ||
50 | .ifnb \i3 | ||
51 | aes\de \i2\().16b, \k\().16b | ||
52 | aes\de \i3\().16b, \k\().16b | ||
53 | .endif | ||
54 | .endif | ||
55 | aes\mc \i0\().16b, \i0\().16b | ||
56 | .ifnb \i1 | ||
57 | aes\mc \i1\().16b, \i1\().16b | ||
58 | .ifnb \i3 | ||
59 | aes\mc \i2\().16b, \i2\().16b | ||
60 | aes\mc \i3\().16b, \i3\().16b | ||
61 | .endif | ||
62 | .endif | ||
63 | .endm | ||
64 | |||
65 | /* up to 4 interleaved encryption rounds with the same round key */ | ||
66 | .macro round_Nx, enc, k, i0, i1, i2, i3 | ||
67 | .ifc \enc, e | ||
68 | do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 | ||
69 | .else | ||
70 | do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 | ||
71 | .endif | ||
72 | .endm | ||
73 | |||
74 | /* up to 4 interleaved final rounds */ | ||
75 | .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 | ||
76 | aes\de \i0\().16b, \k\().16b | ||
77 | .ifnb \i1 | ||
78 | aes\de \i1\().16b, \k\().16b | ||
79 | .ifnb \i3 | ||
80 | aes\de \i2\().16b, \k\().16b | ||
81 | aes\de \i3\().16b, \k\().16b | ||
82 | .endif | ||
83 | .endif | ||
84 | eor \i0\().16b, \i0\().16b, \k2\().16b | ||
85 | .ifnb \i1 | ||
86 | eor \i1\().16b, \i1\().16b, \k2\().16b | ||
87 | .ifnb \i3 | ||
88 | eor \i2\().16b, \i2\().16b, \k2\().16b | ||
89 | eor \i3\().16b, \i3\().16b, \k2\().16b | ||
90 | .endif | ||
91 | .endif | ||
92 | .endm | ||
93 | |||
94 | /* up to 4 interleaved blocks */ | ||
95 | .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 | ||
96 | cmp \rounds, #12 | ||
97 | blo 2222f /* 128 bits */ | ||
98 | beq 1111f /* 192 bits */ | ||
99 | round_Nx \enc, v17, \i0, \i1, \i2, \i3 | ||
100 | round_Nx \enc, v18, \i0, \i1, \i2, \i3 | ||
101 | 1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 | ||
102 | round_Nx \enc, v20, \i0, \i1, \i2, \i3 | ||
103 | 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 | ||
104 | round_Nx \enc, \key, \i0, \i1, \i2, \i3 | ||
105 | .endr | ||
106 | fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 | ||
107 | .endm | ||
108 | |||
109 | .macro encrypt_block, in, rounds, t0, t1, t2 | ||
110 | do_block_Nx e, \rounds, \in | ||
111 | .endm | ||
112 | |||
113 | .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
114 | do_block_Nx e, \rounds, \i0, \i1 | ||
115 | .endm | ||
116 | |||
117 | .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
118 | do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 | ||
119 | .endm | ||
120 | |||
121 | .macro decrypt_block, in, rounds, t0, t1, t2 | ||
122 | do_block_Nx d, \rounds, \in | ||
123 | .endm | ||
124 | |||
125 | .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 | ||
126 | do_block_Nx d, \rounds, \i0, \i1 | ||
127 | .endm | ||
128 | |||
129 | .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 | ||
130 | do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 | ||
131 | .endm | ||
132 | |||
133 | #include "aes-modes.S" | ||
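load_round_keys and do_block_Nx agree on a fixed register assignment for the expanded key, loading only the registers a given key size needs so that the common tail of the schedule is identical for all sizes. The breakdown, derived from the macros above (informational only):

	/* Round-key registers assumed by load_round_keys / do_block_Nx:
	 *   v17-v18  first two round keys   (256-bit keys only)      2 rounds
	 *   v19-v20  next two round keys    (192- and 256-bit keys)  2 rounds
	 *   v21-v29  middle round keys      (all key sizes)          9 rounds
	 *   v30      last full round        v31  final add-round-key
	 * Totals: 9 + 1 = 10 (AES-128), 2 + 9 + 1 = 12 (AES-192),
	 *         2 + 2 + 9 + 1 = 14 (AES-256), matching the rounds argument. */
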
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
new file mode 100644
index 000000000000..60f2f4c12256
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/hwcap.h> | ||
13 | #include <crypto/aes.h> | ||
14 | #include <crypto/ablk_helper.h> | ||
15 | #include <crypto/algapi.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/cpufeature.h> | ||
18 | |||
19 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
20 | #define MODE "ce" | ||
21 | #define PRIO 300 | ||
22 | #define aes_ecb_encrypt ce_aes_ecb_encrypt | ||
23 | #define aes_ecb_decrypt ce_aes_ecb_decrypt | ||
24 | #define aes_cbc_encrypt ce_aes_cbc_encrypt | ||
25 | #define aes_cbc_decrypt ce_aes_cbc_decrypt | ||
26 | #define aes_ctr_encrypt ce_aes_ctr_encrypt | ||
27 | #define aes_xts_encrypt ce_aes_xts_encrypt | ||
28 | #define aes_xts_decrypt ce_aes_xts_decrypt | ||
29 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); | ||
30 | #else | ||
31 | #define MODE "neon" | ||
32 | #define PRIO 200 | ||
33 | #define aes_ecb_encrypt neon_aes_ecb_encrypt | ||
34 | #define aes_ecb_decrypt neon_aes_ecb_decrypt | ||
35 | #define aes_cbc_encrypt neon_aes_cbc_encrypt | ||
36 | #define aes_cbc_decrypt neon_aes_cbc_decrypt | ||
37 | #define aes_ctr_encrypt neon_aes_ctr_encrypt | ||
38 | #define aes_xts_encrypt neon_aes_xts_encrypt | ||
39 | #define aes_xts_decrypt neon_aes_xts_decrypt | ||
40 | MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); | ||
41 | MODULE_ALIAS("ecb(aes)"); | ||
42 | MODULE_ALIAS("cbc(aes)"); | ||
43 | MODULE_ALIAS("ctr(aes)"); | ||
44 | MODULE_ALIAS("xts(aes)"); | ||
45 | #endif | ||
46 | |||
47 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
48 | MODULE_LICENSE("GPL v2"); | ||
49 | |||
50 | /* defined in aes-modes.S */ | ||
51 | asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
52 | int rounds, int blocks, int first); | ||
53 | asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
54 | int rounds, int blocks, int first); | ||
55 | |||
56 | asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
57 | int rounds, int blocks, u8 iv[], int first); | ||
58 | asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], | ||
59 | int rounds, int blocks, u8 iv[], int first); | ||
60 | |||
61 | asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], | ||
62 | int rounds, int blocks, u8 ctr[], int first); | ||
63 | |||
64 | asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
65 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
66 | int first); | ||
67 | asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], | ||
68 | int rounds, int blocks, u8 const rk2[], u8 iv[], | ||
69 | int first); | ||
70 | |||
71 | struct crypto_aes_xts_ctx { | ||
72 | struct crypto_aes_ctx key1; | ||
73 | struct crypto_aes_ctx __aligned(8) key2; | ||
74 | }; | ||
75 | |||
76 | static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, | ||
77 | unsigned int key_len) | ||
78 | { | ||
79 | struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); | ||
80 | int ret; | ||
81 | |||
82 | ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); | ||
83 | if (!ret) | ||
84 | ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], | ||
85 | key_len / 2); | ||
86 | if (!ret) | ||
87 | return 0; | ||
88 | |||
89 | tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | ||
90 | return -EINVAL; | ||
91 | } | ||
92 | |||
93 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
94 | struct scatterlist *src, unsigned int nbytes) | ||
95 | { | ||
96 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
97 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
98 | struct blkcipher_walk walk; | ||
99 | unsigned int blocks; | ||
100 | |||
101 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
102 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
103 | err = blkcipher_walk_virt(desc, &walk); | ||
104 | |||
105 | kernel_neon_begin(); | ||
106 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
107 | aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
108 | (u8 *)ctx->key_enc, rounds, blocks, first); | ||
109 | err = blkcipher_walk_done(desc, &walk, 0); | ||
110 | } | ||
111 | kernel_neon_end(); | ||
112 | return err; | ||
113 | } | ||
114 | |||
115 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
116 | struct scatterlist *src, unsigned int nbytes) | ||
117 | { | ||
118 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
119 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
120 | struct blkcipher_walk walk; | ||
121 | unsigned int blocks; | ||
122 | |||
123 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
124 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
125 | err = blkcipher_walk_virt(desc, &walk); | ||
126 | |||
127 | kernel_neon_begin(); | ||
128 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
129 | aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
130 | (u8 *)ctx->key_dec, rounds, blocks, first); | ||
131 | err = blkcipher_walk_done(desc, &walk, 0); | ||
132 | } | ||
133 | kernel_neon_end(); | ||
134 | return err; | ||
135 | } | ||
136 | |||
137 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
138 | struct scatterlist *src, unsigned int nbytes) | ||
139 | { | ||
140 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
141 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
142 | struct blkcipher_walk walk; | ||
143 | unsigned int blocks; | ||
144 | |||
145 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
146 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
147 | err = blkcipher_walk_virt(desc, &walk); | ||
148 | |||
149 | kernel_neon_begin(); | ||
150 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
151 | aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
152 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
153 | first); | ||
154 | err = blkcipher_walk_done(desc, &walk, 0); | ||
155 | } | ||
156 | kernel_neon_end(); | ||
157 | return err; | ||
158 | } | ||
159 | |||
160 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
164 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
165 | struct blkcipher_walk walk; | ||
166 | unsigned int blocks; | ||
167 | |||
168 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
169 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
170 | err = blkcipher_walk_virt(desc, &walk); | ||
171 | |||
172 | kernel_neon_begin(); | ||
173 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
174 | aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
175 | (u8 *)ctx->key_dec, rounds, blocks, walk.iv, | ||
176 | first); | ||
177 | err = blkcipher_walk_done(desc, &walk, 0); | ||
178 | } | ||
179 | kernel_neon_end(); | ||
180 | return err; | ||
181 | } | ||
182 | |||
183 | static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
184 | struct scatterlist *src, unsigned int nbytes) | ||
185 | { | ||
186 | struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
187 | int err, first, rounds = 6 + ctx->key_length / 4; | ||
188 | struct blkcipher_walk walk; | ||
189 | int blocks; | ||
190 | |||
191 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
192 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
193 | err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); | ||
194 | |||
195 | first = 1; | ||
196 | kernel_neon_begin(); | ||
197 | while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { | ||
198 | aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
199 | (u8 *)ctx->key_enc, rounds, blocks, walk.iv, | ||
200 | first); | ||
201 | first = 0; | ||
202 | nbytes -= blocks * AES_BLOCK_SIZE; | ||
203 | if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) | ||
204 | break; | ||
205 | err = blkcipher_walk_done(desc, &walk, | ||
206 | walk.nbytes % AES_BLOCK_SIZE); | ||
207 | } | ||
208 | if (nbytes) { | ||
209 | u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; | ||
210 | u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; | ||
211 | u8 __aligned(8) tail[AES_BLOCK_SIZE]; | ||
212 | |||
213 | /* | ||
214 | * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need | ||
215 | * to tell aes_ctr_encrypt() to only read half a block. | ||
216 | */ | ||
217 | blocks = (nbytes <= 8) ? -1 : 1; | ||
218 | |||
219 | aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, | ||
220 | blocks, walk.iv, first); | ||
221 | memcpy(tdst, tail, nbytes); | ||
222 | err = blkcipher_walk_done(desc, &walk, 0); | ||
223 | } | ||
224 | kernel_neon_end(); | ||
225 | |||
226 | return err; | ||
227 | } | ||
228 | |||
229 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
230 | struct scatterlist *src, unsigned int nbytes) | ||
231 | { | ||
232 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
233 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
234 | struct blkcipher_walk walk; | ||
235 | unsigned int blocks; | ||
236 | |||
237 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
238 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
239 | err = blkcipher_walk_virt(desc, &walk); | ||
240 | |||
241 | kernel_neon_begin(); | ||
242 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
243 | aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
244 | (u8 *)ctx->key1.key_enc, rounds, blocks, | ||
245 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
246 | err = blkcipher_walk_done(desc, &walk, 0); | ||
247 | } | ||
248 | kernel_neon_end(); | ||
249 | |||
250 | return err; | ||
251 | } | ||
252 | |||
253 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
254 | struct scatterlist *src, unsigned int nbytes) | ||
255 | { | ||
256 | struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
257 | int err, first, rounds = 6 + ctx->key1.key_length / 4; | ||
258 | struct blkcipher_walk walk; | ||
259 | unsigned int blocks; | ||
260 | |||
261 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
262 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
263 | err = blkcipher_walk_virt(desc, &walk); | ||
264 | |||
265 | kernel_neon_begin(); | ||
266 | for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { | ||
267 | aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, | ||
268 | (u8 *)ctx->key1.key_dec, rounds, blocks, | ||
269 | (u8 *)ctx->key2.key_enc, walk.iv, first); | ||
270 | err = blkcipher_walk_done(desc, &walk, 0); | ||
271 | } | ||
272 | kernel_neon_end(); | ||
273 | |||
274 | return err; | ||
275 | } | ||
276 | |||
277 | static struct crypto_alg aes_algs[] = { { | ||
278 | .cra_name = "__ecb-aes-" MODE, | ||
279 | .cra_driver_name = "__driver-ecb-aes-" MODE, | ||
280 | .cra_priority = 0, | ||
281 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
282 | .cra_blocksize = AES_BLOCK_SIZE, | ||
283 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
284 | .cra_alignmask = 7, | ||
285 | .cra_type = &crypto_blkcipher_type, | ||
286 | .cra_module = THIS_MODULE, | ||
287 | .cra_blkcipher = { | ||
288 | .min_keysize = AES_MIN_KEY_SIZE, | ||
289 | .max_keysize = AES_MAX_KEY_SIZE, | ||
290 | .ivsize = AES_BLOCK_SIZE, | ||
291 | .setkey = crypto_aes_set_key, | ||
292 | .encrypt = ecb_encrypt, | ||
293 | .decrypt = ecb_decrypt, | ||
294 | }, | ||
295 | }, { | ||
296 | .cra_name = "__cbc-aes-" MODE, | ||
297 | .cra_driver_name = "__driver-cbc-aes-" MODE, | ||
298 | .cra_priority = 0, | ||
299 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
300 | .cra_blocksize = AES_BLOCK_SIZE, | ||
301 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
302 | .cra_alignmask = 7, | ||
303 | .cra_type = &crypto_blkcipher_type, | ||
304 | .cra_module = THIS_MODULE, | ||
305 | .cra_blkcipher = { | ||
306 | .min_keysize = AES_MIN_KEY_SIZE, | ||
307 | .max_keysize = AES_MAX_KEY_SIZE, | ||
308 | .ivsize = AES_BLOCK_SIZE, | ||
309 | .setkey = crypto_aes_set_key, | ||
310 | .encrypt = cbc_encrypt, | ||
311 | .decrypt = cbc_decrypt, | ||
312 | }, | ||
313 | }, { | ||
314 | .cra_name = "__ctr-aes-" MODE, | ||
315 | .cra_driver_name = "__driver-ctr-aes-" MODE, | ||
316 | .cra_priority = 0, | ||
317 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
318 | .cra_blocksize = 1, | ||
319 | .cra_ctxsize = sizeof(struct crypto_aes_ctx), | ||
320 | .cra_alignmask = 7, | ||
321 | .cra_type = &crypto_blkcipher_type, | ||
322 | .cra_module = THIS_MODULE, | ||
323 | .cra_blkcipher = { | ||
324 | .min_keysize = AES_MIN_KEY_SIZE, | ||
325 | .max_keysize = AES_MAX_KEY_SIZE, | ||
326 | .ivsize = AES_BLOCK_SIZE, | ||
327 | .setkey = crypto_aes_set_key, | ||
328 | .encrypt = ctr_encrypt, | ||
329 | .decrypt = ctr_encrypt, | ||
330 | }, | ||
331 | }, { | ||
332 | .cra_name = "__xts-aes-" MODE, | ||
333 | .cra_driver_name = "__driver-xts-aes-" MODE, | ||
334 | .cra_priority = 0, | ||
335 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
336 | .cra_blocksize = AES_BLOCK_SIZE, | ||
337 | .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), | ||
338 | .cra_alignmask = 7, | ||
339 | .cra_type = &crypto_blkcipher_type, | ||
340 | .cra_module = THIS_MODULE, | ||
341 | .cra_blkcipher = { | ||
342 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
343 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
344 | .ivsize = AES_BLOCK_SIZE, | ||
345 | .setkey = xts_set_key, | ||
346 | .encrypt = xts_encrypt, | ||
347 | .decrypt = xts_decrypt, | ||
348 | }, | ||
349 | }, { | ||
350 | .cra_name = "ecb(aes)", | ||
351 | .cra_driver_name = "ecb-aes-" MODE, | ||
352 | .cra_priority = PRIO, | ||
353 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
354 | .cra_blocksize = AES_BLOCK_SIZE, | ||
355 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
356 | .cra_alignmask = 7, | ||
357 | .cra_type = &crypto_ablkcipher_type, | ||
358 | .cra_module = THIS_MODULE, | ||
359 | .cra_init = ablk_init, | ||
360 | .cra_exit = ablk_exit, | ||
361 | .cra_ablkcipher = { | ||
362 | .min_keysize = AES_MIN_KEY_SIZE, | ||
363 | .max_keysize = AES_MAX_KEY_SIZE, | ||
364 | .ivsize = AES_BLOCK_SIZE, | ||
365 | .setkey = ablk_set_key, | ||
366 | .encrypt = ablk_encrypt, | ||
367 | .decrypt = ablk_decrypt, | ||
368 | } | ||
369 | }, { | ||
370 | .cra_name = "cbc(aes)", | ||
371 | .cra_driver_name = "cbc-aes-" MODE, | ||
372 | .cra_priority = PRIO, | ||
373 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
374 | .cra_blocksize = AES_BLOCK_SIZE, | ||
375 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
376 | .cra_alignmask = 7, | ||
377 | .cra_type = &crypto_ablkcipher_type, | ||
378 | .cra_module = THIS_MODULE, | ||
379 | .cra_init = ablk_init, | ||
380 | .cra_exit = ablk_exit, | ||
381 | .cra_ablkcipher = { | ||
382 | .min_keysize = AES_MIN_KEY_SIZE, | ||
383 | .max_keysize = AES_MAX_KEY_SIZE, | ||
384 | .ivsize = AES_BLOCK_SIZE, | ||
385 | .setkey = ablk_set_key, | ||
386 | .encrypt = ablk_encrypt, | ||
387 | .decrypt = ablk_decrypt, | ||
388 | } | ||
389 | }, { | ||
390 | .cra_name = "ctr(aes)", | ||
391 | .cra_driver_name = "ctr-aes-" MODE, | ||
392 | .cra_priority = PRIO, | ||
393 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
394 | .cra_blocksize = 1, | ||
395 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
396 | .cra_alignmask = 7, | ||
397 | .cra_type = &crypto_ablkcipher_type, | ||
398 | .cra_module = THIS_MODULE, | ||
399 | .cra_init = ablk_init, | ||
400 | .cra_exit = ablk_exit, | ||
401 | .cra_ablkcipher = { | ||
402 | .min_keysize = AES_MIN_KEY_SIZE, | ||
403 | .max_keysize = AES_MAX_KEY_SIZE, | ||
404 | .ivsize = AES_BLOCK_SIZE, | ||
405 | .setkey = ablk_set_key, | ||
406 | .encrypt = ablk_encrypt, | ||
407 | .decrypt = ablk_decrypt, | ||
408 | } | ||
409 | }, { | ||
410 | .cra_name = "xts(aes)", | ||
411 | .cra_driver_name = "xts-aes-" MODE, | ||
412 | .cra_priority = PRIO, | ||
413 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, | ||
414 | .cra_blocksize = AES_BLOCK_SIZE, | ||
415 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
416 | .cra_alignmask = 7, | ||
417 | .cra_type = &crypto_ablkcipher_type, | ||
418 | .cra_module = THIS_MODULE, | ||
419 | .cra_init = ablk_init, | ||
420 | .cra_exit = ablk_exit, | ||
421 | .cra_ablkcipher = { | ||
422 | .min_keysize = 2 * AES_MIN_KEY_SIZE, | ||
423 | .max_keysize = 2 * AES_MAX_KEY_SIZE, | ||
424 | .ivsize = AES_BLOCK_SIZE, | ||
425 | .setkey = ablk_set_key, | ||
426 | .encrypt = ablk_encrypt, | ||
427 | .decrypt = ablk_decrypt, | ||
428 | } | ||
429 | } }; | ||
430 | |||
431 | static int __init aes_init(void) | ||
432 | { | ||
433 | return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
434 | } | ||
435 | |||
436 | static void __exit aes_exit(void) | ||
437 | { | ||
438 | crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); | ||
439 | } | ||
440 | |||
441 | #ifdef USE_V8_CRYPTO_EXTENSIONS | ||
442 | module_cpu_feature_match(AES, aes_init); | ||
443 | #else | ||
444 | module_init(aes_init); | ||
445 | #endif | ||
446 | module_exit(aes_exit); | ||
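Not part of the patch: a minimal sketch of how kernel code of this generation could drive the "xts(aes)" instance registered above through the ablkcipher API. The function name, key size and buffer handling are illustrative only, and most error handling is omitted; a request may also complete asynchronously (-EINPROGRESS), which the sketch ignores.

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int xts_aes_example(u8 *buf, unsigned int len, const u8 *key)
{
	struct crypto_ablkcipher *tfm;
	struct ablkcipher_request *req;
	struct scatterlist sg;
	u8 iv[16] = { 0 };
	int err;

	tfm = crypto_alloc_ablkcipher("xts(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ablkcipher_setkey(tfm, key, 32);	/* 2 x AES-128 keys */
	if (err)
		goto out;

	req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
	sg_init_one(&sg, buf, len);
	ablkcipher_request_set_crypt(req, &sg, &sg, len, iv);

	err = crypto_ablkcipher_encrypt(req);	/* synchronous completion assumed */

	ablkcipher_request_free(req);
out:
	crypto_free_ablkcipher(tfm);
	return err;
}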
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S new file mode 100644 index 000000000000..f6e372c528eb --- /dev/null +++ b/arch/arm64/crypto/aes-modes.S | |||
@@ -0,0 +1,532 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | /* included by aes-ce.S and aes-neon.S */ | ||
12 | |||
13 | .text | ||
14 | .align 4 | ||
15 | |||
16 | /* | ||
17 | * There are several ways to instantiate this code: | ||
18 | * - no interleave, all inline | ||
19 | * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) | ||
20 | * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) | ||
21 | * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) | ||
22 | * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) | ||
23 | * | ||
24 | * Macros imported by this code: | ||
25 | * - enc_prepare - setup NEON registers for encryption | ||
26 | * - dec_prepare - setup NEON registers for decryption | ||
27 | * - enc_switch_key - change to new key after having prepared for encryption | ||
28 | * - encrypt_block - encrypt a single block | ||
29 | * - decrypt_block - decrypt a single block | ||
30 | * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
31 | * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) | ||
32 | * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
33 | * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) | ||
34 | */ | ||
35 | |||
36 | #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) | ||
37 | #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp | ||
38 | #define FRAME_POP ldp x29, x30, [sp],#16 | ||
39 | |||
40 | #if INTERLEAVE == 2 | ||
41 | |||
42 | aes_encrypt_block2x: | ||
43 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
44 | ret | ||
45 | ENDPROC(aes_encrypt_block2x) | ||
46 | |||
47 | aes_decrypt_block2x: | ||
48 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
49 | ret | ||
50 | ENDPROC(aes_decrypt_block2x) | ||
51 | |||
52 | #elif INTERLEAVE == 4 | ||
53 | |||
54 | aes_encrypt_block4x: | ||
55 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
56 | ret | ||
57 | ENDPROC(aes_encrypt_block4x) | ||
58 | |||
59 | aes_decrypt_block4x: | ||
60 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
61 | ret | ||
62 | ENDPROC(aes_decrypt_block4x) | ||
63 | |||
64 | #else | ||
65 | #error INTERLEAVE should equal 2 or 4 | ||
66 | #endif | ||
67 | |||
68 | .macro do_encrypt_block2x | ||
69 | bl aes_encrypt_block2x | ||
70 | .endm | ||
71 | |||
72 | .macro do_decrypt_block2x | ||
73 | bl aes_decrypt_block2x | ||
74 | .endm | ||
75 | |||
76 | .macro do_encrypt_block4x | ||
77 | bl aes_encrypt_block4x | ||
78 | .endm | ||
79 | |||
80 | .macro do_decrypt_block4x | ||
81 | bl aes_decrypt_block4x | ||
82 | .endm | ||
83 | |||
84 | #else | ||
85 | #define FRAME_PUSH | ||
86 | #define FRAME_POP | ||
87 | |||
88 | .macro do_encrypt_block2x | ||
89 | encrypt_block2x v0, v1, w3, x2, x6, w7 | ||
90 | .endm | ||
91 | |||
92 | .macro do_decrypt_block2x | ||
93 | decrypt_block2x v0, v1, w3, x2, x6, w7 | ||
94 | .endm | ||
95 | |||
96 | .macro do_encrypt_block4x | ||
97 | encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
98 | .endm | ||
99 | |||
100 | .macro do_decrypt_block4x | ||
101 | decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 | ||
102 | .endm | ||
103 | |||
104 | #endif | ||
105 | |||
106 | /* | ||
107 | * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
108 | * int blocks, int first) | ||
109 | * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
110 | * int blocks, int first) | ||
111 | */ | ||
112 | |||
113 | AES_ENTRY(aes_ecb_encrypt) | ||
114 | FRAME_PUSH | ||
115 | cbz w5, .LecbencloopNx | ||
116 | |||
117 | enc_prepare w3, x2, x5 | ||
118 | |||
119 | .LecbencloopNx: | ||
120 | #if INTERLEAVE >= 2 | ||
121 | subs w4, w4, #INTERLEAVE | ||
122 | bmi .Lecbenc1x | ||
123 | #if INTERLEAVE == 2 | ||
124 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
125 | do_encrypt_block2x | ||
126 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
127 | #else | ||
128 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
129 | do_encrypt_block4x | ||
130 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
131 | #endif | ||
132 | b .LecbencloopNx | ||
133 | .Lecbenc1x: | ||
134 | adds w4, w4, #INTERLEAVE | ||
135 | beq .Lecbencout | ||
136 | #endif | ||
137 | .Lecbencloop: | ||
138 | ld1 {v0.16b}, [x1], #16 /* get next pt block */ | ||
139 | encrypt_block v0, w3, x2, x5, w6 | ||
140 | st1 {v0.16b}, [x0], #16 | ||
141 | subs w4, w4, #1 | ||
142 | bne .Lecbencloop | ||
143 | .Lecbencout: | ||
144 | FRAME_POP | ||
145 | ret | ||
146 | AES_ENDPROC(aes_ecb_encrypt) | ||
147 | |||
148 | |||
149 | AES_ENTRY(aes_ecb_decrypt) | ||
150 | FRAME_PUSH | ||
151 | cbz w5, .LecbdecloopNx | ||
152 | |||
153 | dec_prepare w3, x2, x5 | ||
154 | |||
155 | .LecbdecloopNx: | ||
156 | #if INTERLEAVE >= 2 | ||
157 | subs w4, w4, #INTERLEAVE | ||
158 | bmi .Lecbdec1x | ||
159 | #if INTERLEAVE == 2 | ||
160 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
161 | do_decrypt_block2x | ||
162 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
163 | #else | ||
164 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
165 | do_decrypt_block4x | ||
166 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
167 | #endif | ||
168 | b .LecbdecloopNx | ||
169 | .Lecbdec1x: | ||
170 | adds w4, w4, #INTERLEAVE | ||
171 | beq .Lecbdecout | ||
172 | #endif | ||
173 | .Lecbdecloop: | ||
174 | ld1 {v0.16b}, [x1], #16 /* get next ct block */ | ||
175 | decrypt_block v0, w3, x2, x5, w6 | ||
176 | st1 {v0.16b}, [x0], #16 | ||
177 | subs w4, w4, #1 | ||
178 | bne .Lecbdecloop | ||
179 | .Lecbdecout: | ||
180 | FRAME_POP | ||
181 | ret | ||
182 | AES_ENDPROC(aes_ecb_decrypt) | ||
183 | |||
184 | |||
185 | /* | ||
186 | * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
187 | * int blocks, u8 iv[], int first) | ||
188 | * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
189 | * int blocks, u8 iv[], int first) | ||
190 | */ | ||
191 | |||
192 | AES_ENTRY(aes_cbc_encrypt) | ||
193 | cbz w6, .Lcbcencloop | ||
194 | |||
195 | ld1 {v0.16b}, [x5] /* get iv */ | ||
196 | enc_prepare w3, x2, x5 | ||
197 | |||
198 | .Lcbcencloop: | ||
199 | ld1 {v1.16b}, [x1], #16 /* get next pt block */ | ||
200 | eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ | ||
201 | encrypt_block v0, w3, x2, x5, w6 | ||
202 | st1 {v0.16b}, [x0], #16 | ||
203 | subs w4, w4, #1 | ||
204 | bne .Lcbcencloop | ||
205 | ret | ||
206 | AES_ENDPROC(aes_cbc_encrypt) | ||
207 | |||
208 | |||
209 | AES_ENTRY(aes_cbc_decrypt) | ||
210 | FRAME_PUSH | ||
211 | cbz w6, .LcbcdecloopNx | ||
212 | |||
213 | ld1 {v7.16b}, [x5] /* get iv */ | ||
214 | dec_prepare w3, x2, x5 | ||
215 | |||
216 | .LcbcdecloopNx: | ||
217 | #if INTERLEAVE >= 2 | ||
218 | subs w4, w4, #INTERLEAVE | ||
219 | bmi .Lcbcdec1x | ||
220 | #if INTERLEAVE == 2 | ||
221 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
222 | mov v2.16b, v0.16b | ||
223 | mov v3.16b, v1.16b | ||
224 | do_decrypt_block2x | ||
225 | eor v0.16b, v0.16b, v7.16b | ||
226 | eor v1.16b, v1.16b, v2.16b | ||
227 | mov v7.16b, v3.16b | ||
228 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
229 | #else | ||
230 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
231 | mov v4.16b, v0.16b | ||
232 | mov v5.16b, v1.16b | ||
233 | mov v6.16b, v2.16b | ||
234 | do_decrypt_block4x | ||
235 | sub x1, x1, #16 | ||
236 | eor v0.16b, v0.16b, v7.16b | ||
237 | eor v1.16b, v1.16b, v4.16b | ||
238 | ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ | ||
239 | eor v2.16b, v2.16b, v5.16b | ||
240 | eor v3.16b, v3.16b, v6.16b | ||
241 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
242 | #endif | ||
243 | b .LcbcdecloopNx | ||
244 | .Lcbcdec1x: | ||
245 | adds w4, w4, #INTERLEAVE | ||
246 | beq .Lcbcdecout | ||
247 | #endif | ||
248 | .Lcbcdecloop: | ||
249 | ld1 {v1.16b}, [x1], #16 /* get next ct block */ | ||
250 | mov v0.16b, v1.16b /* ...and copy to v0 */ | ||
251 | decrypt_block v0, w3, x2, x5, w6 | ||
252 | eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ | ||
253 | mov v7.16b, v1.16b /* ct is next iv */ | ||
254 | st1 {v0.16b}, [x0], #16 | ||
255 | subs w4, w4, #1 | ||
256 | bne .Lcbcdecloop | ||
257 | .Lcbcdecout: | ||
258 | FRAME_POP | ||
259 | ret | ||
260 | AES_ENDPROC(aes_cbc_decrypt) | ||
261 | |||
262 | |||
263 | /* | ||
264 | * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, | ||
265 | * int blocks, u8 ctr[], int first) | ||
266 | */ | ||
267 | |||
268 | AES_ENTRY(aes_ctr_encrypt) | ||
269 | FRAME_PUSH | ||
270 | cbnz w6, .Lctrfirst /* 1st time around? */ | ||
271 | umov x5, v4.d[1] /* keep swabbed ctr in reg */ | ||
272 | rev x5, x5 | ||
273 | #if INTERLEAVE >= 2 | ||
274 | cmn w5, w4 /* 32 bit overflow? */ | ||
275 | bcs .Lctrinc | ||
276 | add x5, x5, #1 /* increment BE ctr */ | ||
277 | b .LctrincNx | ||
278 | #else | ||
279 | b .Lctrinc | ||
280 | #endif | ||
281 | .Lctrfirst: | ||
282 | enc_prepare w3, x2, x6 | ||
283 | ld1 {v4.16b}, [x5] | ||
284 | umov x5, v4.d[1] /* keep swabbed ctr in reg */ | ||
285 | rev x5, x5 | ||
286 | #if INTERLEAVE >= 2 | ||
287 | cmn w5, w4 /* 32 bit overflow? */ | ||
288 | bcs .Lctrloop | ||
289 | .LctrloopNx: | ||
290 | subs w4, w4, #INTERLEAVE | ||
291 | bmi .Lctr1x | ||
292 | #if INTERLEAVE == 2 | ||
293 | mov v0.8b, v4.8b | ||
294 | mov v1.8b, v4.8b | ||
295 | rev x7, x5 | ||
296 | add x5, x5, #1 | ||
297 | ins v0.d[1], x7 | ||
298 | rev x7, x5 | ||
299 | add x5, x5, #1 | ||
300 | ins v1.d[1], x7 | ||
301 | ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ | ||
302 | do_encrypt_block2x | ||
303 | eor v0.16b, v0.16b, v2.16b | ||
304 | eor v1.16b, v1.16b, v3.16b | ||
305 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
306 | #else | ||
307 | ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ | ||
308 | dup v7.4s, w5 | ||
309 | mov v0.16b, v4.16b | ||
310 | add v7.4s, v7.4s, v8.4s | ||
311 | mov v1.16b, v4.16b | ||
312 | rev32 v8.16b, v7.16b | ||
313 | mov v2.16b, v4.16b | ||
314 | mov v3.16b, v4.16b | ||
315 | mov v1.s[3], v8.s[0] | ||
316 | mov v2.s[3], v8.s[1] | ||
317 | mov v3.s[3], v8.s[2] | ||
318 | ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ | ||
319 | do_encrypt_block4x | ||
320 | eor v0.16b, v5.16b, v0.16b | ||
321 | ld1 {v5.16b}, [x1], #16 /* get 1 input block */ | ||
322 | eor v1.16b, v6.16b, v1.16b | ||
323 | eor v2.16b, v7.16b, v2.16b | ||
324 | eor v3.16b, v5.16b, v3.16b | ||
325 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
326 | add x5, x5, #INTERLEAVE | ||
327 | #endif | ||
328 | cbz w4, .LctroutNx | ||
329 | .LctrincNx: | ||
330 | rev x7, x5 | ||
331 | ins v4.d[1], x7 | ||
332 | b .LctrloopNx | ||
333 | .LctroutNx: | ||
334 | sub x5, x5, #1 | ||
335 | rev x7, x5 | ||
336 | ins v4.d[1], x7 | ||
337 | b .Lctrout | ||
338 | .Lctr1x: | ||
339 | adds w4, w4, #INTERLEAVE | ||
340 | beq .Lctrout | ||
341 | #endif | ||
342 | .Lctrloop: | ||
343 | mov v0.16b, v4.16b | ||
344 | encrypt_block v0, w3, x2, x6, w7 | ||
345 | subs w4, w4, #1 | ||
346 | bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ | ||
347 | ld1 {v3.16b}, [x1], #16 | ||
348 | eor v3.16b, v0.16b, v3.16b | ||
349 | st1 {v3.16b}, [x0], #16 | ||
350 | beq .Lctrout | ||
351 | .Lctrinc: | ||
352 | adds x5, x5, #1 /* increment BE ctr */ | ||
353 | rev x7, x5 | ||
354 | ins v4.d[1], x7 | ||
355 | bcc .Lctrloop /* no overflow? */ | ||
356 | umov x7, v4.d[0] /* load upper word of ctr */ | ||
357 | rev x7, x7 /* ... to handle the carry */ | ||
358 | add x7, x7, #1 | ||
359 | rev x7, x7 | ||
360 | ins v4.d[0], x7 | ||
361 | b .Lctrloop | ||
362 | .Lctrhalfblock: | ||
363 | ld1 {v3.8b}, [x1] | ||
364 | eor v3.8b, v0.8b, v3.8b | ||
365 | st1 {v3.8b}, [x0] | ||
366 | .Lctrout: | ||
367 | FRAME_POP | ||
368 | ret | ||
369 | AES_ENDPROC(aes_ctr_encrypt) | ||
370 | .ltorg | ||
371 | |||
372 | |||
373 | /* | ||
374 | * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
375 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
376 | * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, | ||
377 | * int blocks, u8 const rk2[], u8 iv[], int first) | ||
378 | */ | ||
379 | |||
380 | .macro next_tweak, out, in, const, tmp | ||
381 | sshr \tmp\().2d, \in\().2d, #63 | ||
382 | and \tmp\().16b, \tmp\().16b, \const\().16b | ||
383 | add \out\().2d, \in\().2d, \in\().2d | ||
384 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 | ||
385 | eor \out\().16b, \out\().16b, \tmp\().16b | ||
386 | .endm | ||
387 | |||
388 | .Lxts_mul_x: | ||
389 | .word 1, 0, 0x87, 0 | ||
390 | |||
391 | AES_ENTRY(aes_xts_encrypt) | ||
392 | FRAME_PUSH | ||
393 | cbz w7, .LxtsencloopNx | ||
394 | |||
395 | ld1 {v4.16b}, [x6] | ||
396 | enc_prepare w3, x5, x6 | ||
397 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
398 | enc_switch_key w3, x2, x6 | ||
399 | ldr q7, .Lxts_mul_x | ||
400 | b .LxtsencNx | ||
401 | |||
402 | .LxtsencloopNx: | ||
403 | ldr q7, .Lxts_mul_x | ||
404 | next_tweak v4, v4, v7, v8 | ||
405 | .LxtsencNx: | ||
406 | #if INTERLEAVE >= 2 | ||
407 | subs w4, w4, #INTERLEAVE | ||
408 | bmi .Lxtsenc1x | ||
409 | #if INTERLEAVE == 2 | ||
410 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ | ||
411 | next_tweak v5, v4, v7, v8 | ||
412 | eor v0.16b, v0.16b, v4.16b | ||
413 | eor v1.16b, v1.16b, v5.16b | ||
414 | do_encrypt_block2x | ||
415 | eor v0.16b, v0.16b, v4.16b | ||
416 | eor v1.16b, v1.16b, v5.16b | ||
417 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
418 | cbz w4, .LxtsencoutNx | ||
419 | next_tweak v4, v5, v7, v8 | ||
420 | b .LxtsencNx | ||
421 | .LxtsencoutNx: | ||
422 | mov v4.16b, v5.16b | ||
423 | b .Lxtsencout | ||
424 | #else | ||
425 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ | ||
426 | next_tweak v5, v4, v7, v8 | ||
427 | eor v0.16b, v0.16b, v4.16b | ||
428 | next_tweak v6, v5, v7, v8 | ||
429 | eor v1.16b, v1.16b, v5.16b | ||
430 | eor v2.16b, v2.16b, v6.16b | ||
431 | next_tweak v7, v6, v7, v8 | ||
432 | eor v3.16b, v3.16b, v7.16b | ||
433 | do_encrypt_block4x | ||
434 | eor v3.16b, v3.16b, v7.16b | ||
435 | eor v0.16b, v0.16b, v4.16b | ||
436 | eor v1.16b, v1.16b, v5.16b | ||
437 | eor v2.16b, v2.16b, v6.16b | ||
438 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
439 | mov v4.16b, v7.16b | ||
440 | cbz w4, .Lxtsencout | ||
441 | b .LxtsencloopNx | ||
442 | #endif | ||
443 | .Lxtsenc1x: | ||
444 | adds w4, w4, #INTERLEAVE | ||
445 | beq .Lxtsencout | ||
446 | #endif | ||
447 | .Lxtsencloop: | ||
448 | ld1 {v1.16b}, [x1], #16 | ||
449 | eor v0.16b, v1.16b, v4.16b | ||
450 | encrypt_block v0, w3, x2, x6, w7 | ||
451 | eor v0.16b, v0.16b, v4.16b | ||
452 | st1 {v0.16b}, [x0], #16 | ||
453 | subs w4, w4, #1 | ||
454 | beq .Lxtsencout | ||
455 | next_tweak v4, v4, v7, v8 | ||
456 | b .Lxtsencloop | ||
457 | .Lxtsencout: | ||
458 | FRAME_POP | ||
459 | ret | ||
460 | AES_ENDPROC(aes_xts_encrypt) | ||
461 | |||
462 | |||
463 | AES_ENTRY(aes_xts_decrypt) | ||
464 | FRAME_PUSH | ||
465 | cbz w7, .LxtsdecloopNx | ||
466 | |||
467 | ld1 {v4.16b}, [x6] | ||
468 | enc_prepare w3, x5, x6 | ||
469 | encrypt_block v4, w3, x5, x6, w7 /* first tweak */ | ||
470 | dec_prepare w3, x2, x6 | ||
471 | ldr q7, .Lxts_mul_x | ||
472 | b .LxtsdecNx | ||
473 | |||
474 | .LxtsdecloopNx: | ||
475 | ldr q7, .Lxts_mul_x | ||
476 | next_tweak v4, v4, v7, v8 | ||
477 | .LxtsdecNx: | ||
478 | #if INTERLEAVE >= 2 | ||
479 | subs w4, w4, #INTERLEAVE | ||
480 | bmi .Lxtsdec1x | ||
481 | #if INTERLEAVE == 2 | ||
482 | ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ | ||
483 | next_tweak v5, v4, v7, v8 | ||
484 | eor v0.16b, v0.16b, v4.16b | ||
485 | eor v1.16b, v1.16b, v5.16b | ||
486 | do_decrypt_block2x | ||
487 | eor v0.16b, v0.16b, v4.16b | ||
488 | eor v1.16b, v1.16b, v5.16b | ||
489 | st1 {v0.16b-v1.16b}, [x0], #32 | ||
490 | cbz w4, .LxtsdecoutNx | ||
491 | next_tweak v4, v5, v7, v8 | ||
492 | b .LxtsdecNx | ||
493 | .LxtsdecoutNx: | ||
494 | mov v4.16b, v5.16b | ||
495 | b .Lxtsdecout | ||
496 | #else | ||
497 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ | ||
498 | next_tweak v5, v4, v7, v8 | ||
499 | eor v0.16b, v0.16b, v4.16b | ||
500 | next_tweak v6, v5, v7, v8 | ||
501 | eor v1.16b, v1.16b, v5.16b | ||
502 | eor v2.16b, v2.16b, v6.16b | ||
503 | next_tweak v7, v6, v7, v8 | ||
504 | eor v3.16b, v3.16b, v7.16b | ||
505 | do_decrypt_block4x | ||
506 | eor v3.16b, v3.16b, v7.16b | ||
507 | eor v0.16b, v0.16b, v4.16b | ||
508 | eor v1.16b, v1.16b, v5.16b | ||
509 | eor v2.16b, v2.16b, v6.16b | ||
510 | st1 {v0.16b-v3.16b}, [x0], #64 | ||
511 | mov v4.16b, v7.16b | ||
512 | cbz w4, .Lxtsdecout | ||
513 | b .LxtsdecloopNx | ||
514 | #endif | ||
515 | .Lxtsdec1x: | ||
516 | adds w4, w4, #INTERLEAVE | ||
517 | beq .Lxtsdecout | ||
518 | #endif | ||
519 | .Lxtsdecloop: | ||
520 | ld1 {v1.16b}, [x1], #16 | ||
521 | eor v0.16b, v1.16b, v4.16b | ||
522 | decrypt_block v0, w3, x2, x6, w7 | ||
523 | eor v0.16b, v0.16b, v4.16b | ||
524 | st1 {v0.16b}, [x0], #16 | ||
525 | subs w4, w4, #1 | ||
526 | beq .Lxtsdecout | ||
527 | next_tweak v4, v4, v7, v8 | ||
528 | b .Lxtsdecloop | ||
529 | .Lxtsdecout: | ||
530 | FRAME_POP | ||
531 | ret | ||
532 | AES_ENDPROC(aes_xts_decrypt) | ||
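For reference (not part of the patch), a C sketch of what the next_tweak macro and the .Lxts_mul_x constant compute: the 128-bit XTS tweak is multiplied by 'x' in GF(2^128) and reduced by x^128 + x^7 + x^2 + x + 1, which is what the 0x87 constant encodes. Here t[0] holds the low 64 bits of the tweak and t[1] the high 64 bits.

#include <linux/types.h>

/* multiply the XTS tweak by 'x' in GF(2^128), as next_tweak does */
static void xts_next_tweak(u64 t[2])
{
	u64 carry = t[1] >> 63;			/* bit shifted out of the top  */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* shift, pulling in low MSB   */
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);/* shift low half and reduce   */
}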
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S new file mode 100644 index 000000000000..b93170e1cc93 --- /dev/null +++ b/arch/arm64/crypto/aes-neon.S | |||
@@ -0,0 +1,382 @@ | |||
1 | /* | ||
2 | * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | |||
13 | #define AES_ENTRY(func) ENTRY(neon_ ## func) | ||
14 | #define AES_ENDPROC(func) ENDPROC(neon_ ## func) | ||
15 | |||
16 | /* multiply by polynomial 'x' in GF(2^8) */ | ||
17 | .macro mul_by_x, out, in, temp, const | ||
18 | sshr \temp, \in, #7 | ||
19 | add \out, \in, \in | ||
20 | and \temp, \temp, \const | ||
21 | eor \out, \out, \temp | ||
22 | .endm | ||
23 | |||
24 | /* preload the entire Sbox */ | ||
25 | .macro prepare, sbox, shiftrows, temp | ||
26 | adr \temp, \sbox | ||
27 | movi v12.16b, #0x40 | ||
28 | ldr q13, \shiftrows | ||
29 | movi v14.16b, #0x1b | ||
30 | ld1 {v16.16b-v19.16b}, [\temp], #64 | ||
31 | ld1 {v20.16b-v23.16b}, [\temp], #64 | ||
32 | ld1 {v24.16b-v27.16b}, [\temp], #64 | ||
33 | ld1 {v28.16b-v31.16b}, [\temp] | ||
34 | .endm | ||
35 | |||
36 | /* do preload for encryption */ | ||
37 | .macro enc_prepare, ignore0, ignore1, temp | ||
38 | prepare .LForward_Sbox, .LForward_ShiftRows, \temp | ||
39 | .endm | ||
40 | |||
41 | .macro enc_switch_key, ignore0, ignore1, temp | ||
42 | /* do nothing */ | ||
43 | .endm | ||
44 | |||
45 | /* do preload for decryption */ | ||
46 | .macro dec_prepare, ignore0, ignore1, temp | ||
47 | prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp | ||
48 | .endm | ||
49 | |||
50 | /* apply SubBytes transformation using the preloaded Sbox */ | ||
51 | .macro sub_bytes, in | ||
52 | sub v9.16b, \in\().16b, v12.16b | ||
53 | tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b | ||
54 | sub v10.16b, v9.16b, v12.16b | ||
55 | tbx \in\().16b, {v20.16b-v23.16b}, v9.16b | ||
56 | sub v11.16b, v10.16b, v12.16b | ||
57 | tbx \in\().16b, {v24.16b-v27.16b}, v10.16b | ||
58 | tbx \in\().16b, {v28.16b-v31.16b}, v11.16b | ||
59 | .endm | ||
60 | |||
61 | /* apply MixColumns transformation */ | ||
62 | .macro mix_columns, in | ||
63 | mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b | ||
64 | rev32 v8.8h, \in\().8h | ||
65 | eor \in\().16b, v10.16b, \in\().16b | ||
66 | shl v9.4s, v8.4s, #24 | ||
67 | shl v11.4s, \in\().4s, #24 | ||
68 | sri v9.4s, v8.4s, #8 | ||
69 | sri v11.4s, \in\().4s, #8 | ||
70 | eor v9.16b, v9.16b, v8.16b | ||
71 | eor v10.16b, v10.16b, v9.16b | ||
72 | eor \in\().16b, v10.16b, v11.16b | ||
73 | .endm | ||
74 | |||
75 | /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ | ||
76 | .macro inv_mix_columns, in | ||
77 | mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b | ||
78 | mul_by_x v11.16b, v11.16b, v10.16b, v14.16b | ||
79 | eor \in\().16b, \in\().16b, v11.16b | ||
80 | rev32 v11.8h, v11.8h | ||
81 | eor \in\().16b, \in\().16b, v11.16b | ||
82 | mix_columns \in | ||
83 | .endm | ||
84 | |||
85 | .macro do_block, enc, in, rounds, rk, rkp, i | ||
86 | ld1 {v15.16b}, [\rk] | ||
87 | add \rkp, \rk, #16 | ||
88 | mov \i, \rounds | ||
89 | 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
90 | tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ | ||
91 | sub_bytes \in | ||
92 | ld1 {v15.16b}, [\rkp], #16 | ||
93 | subs \i, \i, #1 | ||
94 | beq 2222f | ||
95 | .if \enc == 1 | ||
96 | mix_columns \in | ||
97 | .else | ||
98 | inv_mix_columns \in | ||
99 | .endif | ||
100 | b 1111b | ||
101 | 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ | ||
102 | .endm | ||
103 | |||
104 | .macro encrypt_block, in, rounds, rk, rkp, i | ||
105 | do_block 1, \in, \rounds, \rk, \rkp, \i | ||
106 | .endm | ||
107 | |||
108 | .macro decrypt_block, in, rounds, rk, rkp, i | ||
109 | do_block 0, \in, \rounds, \rk, \rkp, \i | ||
110 | .endm | ||
111 | |||
112 | /* | ||
113 | * Interleaved versions: functionally equivalent to the | ||
114 | * ones above, but applied to 2 or 4 AES states in parallel. | ||
115 | */ | ||
116 | |||
117 | .macro sub_bytes_2x, in0, in1 | ||
118 | sub v8.16b, \in0\().16b, v12.16b | ||
119 | sub v9.16b, \in1\().16b, v12.16b | ||
120 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b | ||
121 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
122 | sub v10.16b, v8.16b, v12.16b | ||
123 | sub v11.16b, v9.16b, v12.16b | ||
124 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b | ||
125 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
126 | sub v8.16b, v10.16b, v12.16b | ||
127 | sub v9.16b, v11.16b, v12.16b | ||
128 | tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b | ||
129 | tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b | ||
130 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b | ||
131 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
132 | .endm | ||
133 | |||
134 | .macro sub_bytes_4x, in0, in1, in2, in3 | ||
135 | sub v8.16b, \in0\().16b, v12.16b | ||
136 | tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b | ||
137 | sub v9.16b, \in1\().16b, v12.16b | ||
138 | tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b | ||
139 | sub v10.16b, \in2\().16b, v12.16b | ||
140 | tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b | ||
141 | sub v11.16b, \in3\().16b, v12.16b | ||
142 | tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b | ||
143 | tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b | ||
144 | tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b | ||
145 | sub v8.16b, v8.16b, v12.16b | ||
146 | tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b | ||
147 | sub v9.16b, v9.16b, v12.16b | ||
148 | tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b | ||
149 | sub v10.16b, v10.16b, v12.16b | ||
150 | tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b | ||
151 | sub v11.16b, v11.16b, v12.16b | ||
152 | tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b | ||
153 | sub v8.16b, v8.16b, v12.16b | ||
154 | tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b | ||
155 | sub v9.16b, v9.16b, v12.16b | ||
156 | tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b | ||
157 | sub v10.16b, v10.16b, v12.16b | ||
158 | tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b | ||
159 | sub v11.16b, v11.16b, v12.16b | ||
160 | tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b | ||
161 | tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b | ||
162 | tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b | ||
163 | .endm | ||
164 | |||
165 | .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const | ||
166 | sshr \tmp0\().16b, \in0\().16b, #7 | ||
167 | add \out0\().16b, \in0\().16b, \in0\().16b | ||
168 | sshr \tmp1\().16b, \in1\().16b, #7 | ||
169 | and \tmp0\().16b, \tmp0\().16b, \const\().16b | ||
170 | add \out1\().16b, \in1\().16b, \in1\().16b | ||
171 | and \tmp1\().16b, \tmp1\().16b, \const\().16b | ||
172 | eor \out0\().16b, \out0\().16b, \tmp0\().16b | ||
173 | eor \out1\().16b, \out1\().16b, \tmp1\().16b | ||
174 | .endm | ||
175 | |||
176 | .macro mix_columns_2x, in0, in1 | ||
177 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
178 | rev32 v10.8h, \in0\().8h | ||
179 | rev32 v11.8h, \in1\().8h | ||
180 | eor \in0\().16b, v8.16b, \in0\().16b | ||
181 | eor \in1\().16b, v9.16b, \in1\().16b | ||
182 | shl v12.4s, v10.4s, #24 | ||
183 | shl v13.4s, v11.4s, #24 | ||
184 | eor v8.16b, v8.16b, v10.16b | ||
185 | sri v12.4s, v10.4s, #8 | ||
186 | shl v10.4s, \in0\().4s, #24 | ||
187 | eor v9.16b, v9.16b, v11.16b | ||
188 | sri v13.4s, v11.4s, #8 | ||
189 | shl v11.4s, \in1\().4s, #24 | ||
190 | sri v10.4s, \in0\().4s, #8 | ||
191 | eor \in0\().16b, v8.16b, v12.16b | ||
192 | sri v11.4s, \in1\().4s, #8 | ||
193 | eor \in1\().16b, v9.16b, v13.16b | ||
194 | eor \in0\().16b, v10.16b, \in0\().16b | ||
195 | eor \in1\().16b, v11.16b, \in1\().16b | ||
196 | .endm | ||
197 | |||
198 | .macro inv_mix_cols_2x, in0, in1 | ||
199 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
200 | mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 | ||
201 | eor \in0\().16b, \in0\().16b, v8.16b | ||
202 | eor \in1\().16b, \in1\().16b, v9.16b | ||
203 | rev32 v8.8h, v8.8h | ||
204 | rev32 v9.8h, v9.8h | ||
205 | eor \in0\().16b, \in0\().16b, v8.16b | ||
206 | eor \in1\().16b, \in1\().16b, v9.16b | ||
207 | mix_columns_2x \in0, \in1 | ||
208 | .endm | ||
209 | |||
210 | .macro inv_mix_cols_4x, in0, in1, in2, in3 | ||
211 | mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 | ||
212 | mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 | ||
213 | mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 | ||
214 | mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 | ||
215 | eor \in0\().16b, \in0\().16b, v8.16b | ||
216 | eor \in1\().16b, \in1\().16b, v9.16b | ||
217 | eor \in2\().16b, \in2\().16b, v10.16b | ||
218 | eor \in3\().16b, \in3\().16b, v11.16b | ||
219 | rev32 v8.8h, v8.8h | ||
220 | rev32 v9.8h, v9.8h | ||
221 | rev32 v10.8h, v10.8h | ||
222 | rev32 v11.8h, v11.8h | ||
223 | eor \in0\().16b, \in0\().16b, v8.16b | ||
224 | eor \in1\().16b, \in1\().16b, v9.16b | ||
225 | eor \in2\().16b, \in2\().16b, v10.16b | ||
226 | eor \in3\().16b, \in3\().16b, v11.16b | ||
227 | mix_columns_2x \in0, \in1 | ||
228 | mix_columns_2x \in2, \in3 | ||
229 | .endm | ||
230 | |||
231 | .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i | ||
232 | ld1 {v15.16b}, [\rk] | ||
233 | add \rkp, \rk, #16 | ||
234 | mov \i, \rounds | ||
235 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
236 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
237 | sub_bytes_2x \in0, \in1 | ||
238 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
239 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
240 | ld1 {v15.16b}, [\rkp], #16 | ||
241 | subs \i, \i, #1 | ||
242 | beq 2222f | ||
243 | .if \enc == 1 | ||
244 | mix_columns_2x \in0, \in1 | ||
245 | ldr q13, .LForward_ShiftRows | ||
246 | .else | ||
247 | inv_mix_cols_2x \in0, \in1 | ||
248 | ldr q13, .LReverse_ShiftRows | ||
249 | .endif | ||
250 | movi v12.16b, #0x40 | ||
251 | b 1111b | ||
252 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
253 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
254 | .endm | ||
255 | |||
256 | .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i | ||
257 | ld1 {v15.16b}, [\rk] | ||
258 | add \rkp, \rk, #16 | ||
259 | mov \i, \rounds | ||
260 | 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
261 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
262 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
263 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
264 | sub_bytes_4x \in0, \in1, \in2, \in3 | ||
265 | tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ | ||
266 | tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ | ||
267 | tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ | ||
268 | tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ | ||
269 | ld1 {v15.16b}, [\rkp], #16 | ||
270 | subs \i, \i, #1 | ||
271 | beq 2222f | ||
272 | .if \enc == 1 | ||
273 | mix_columns_2x \in0, \in1 | ||
274 | mix_columns_2x \in2, \in3 | ||
275 | ldr q13, .LForward_ShiftRows | ||
276 | .else | ||
277 | inv_mix_cols_4x \in0, \in1, \in2, \in3 | ||
278 | ldr q13, .LReverse_ShiftRows | ||
279 | .endif | ||
280 | movi v12.16b, #0x40 | ||
281 | b 1111b | ||
282 | 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ | ||
283 | eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ | ||
284 | eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ | ||
285 | eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ | ||
286 | .endm | ||
287 | |||
288 | .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i | ||
289 | do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i | ||
290 | .endm | ||
291 | |||
292 | .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i | ||
293 | do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i | ||
294 | .endm | ||
295 | |||
296 | .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i | ||
297 | do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
298 | .endm | ||
299 | |||
300 | .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i | ||
301 | do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i | ||
302 | .endm | ||
303 | |||
304 | #include "aes-modes.S" | ||
305 | |||
306 | .text | ||
307 | .align 4 | ||
308 | .LForward_ShiftRows: | ||
309 | .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 | ||
310 | .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb | ||
311 | |||
312 | .LReverse_ShiftRows: | ||
313 | .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb | ||
314 | .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 | ||
315 | |||
316 | .LForward_Sbox: | ||
317 | .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | ||
318 | .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | ||
319 | .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | ||
320 | .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | ||
321 | .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | ||
322 | .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | ||
323 | .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | ||
324 | .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | ||
325 | .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | ||
326 | .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | ||
327 | .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | ||
328 | .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | ||
329 | .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | ||
330 | .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | ||
331 | .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | ||
332 | .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | ||
333 | .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | ||
334 | .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | ||
335 | .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | ||
336 | .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | ||
337 | .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | ||
338 | .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | ||
339 | .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | ||
340 | .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | ||
341 | .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | ||
342 | .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | ||
343 | .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | ||
344 | .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | ||
345 | .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | ||
346 | .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | ||
347 | .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | ||
348 | .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | ||
349 | |||
350 | .LReverse_Sbox: | ||
351 | .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | ||
352 | .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | ||
353 | .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | ||
354 | .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | ||
355 | .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | ||
356 | .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | ||
357 | .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | ||
358 | .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | ||
359 | .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | ||
360 | .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | ||
361 | .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | ||
362 | .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | ||
363 | .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | ||
364 | .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | ||
365 | .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | ||
366 | .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | ||
367 | .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | ||
368 | .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | ||
369 | .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | ||
370 | .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | ||
371 | .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | ||
372 | .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | ||
373 | .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | ||
374 | .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | ||
375 | .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | ||
376 | .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | ||
377 | .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | ||
378 | .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | ||
379 | .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | ||
380 | .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | ||
381 | .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | ||
382 | .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | ||
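For reference (not part of the patch), a byte-level C sketch of the mul_by_x macro above: each byte is multiplied by the polynomial 'x' in GF(2^8) and reduced modulo x^8 + x^4 + x^3 + x + 1, which is what the 0x1b constant kept in v14 encodes.

#include <linux/types.h>

/* multiply one byte by 'x' in GF(2^8) (the AES "xtime" operation) */
static u8 aes_mul_by_x(u8 b)
{
	u8 mask = (b & 0x80) ? 0x1b : 0;	/* sshr #7, then and with 0x1b */

	return (u8)((b << 1) ^ mask);		/* add (b + b), then eor       */
}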
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S new file mode 100644 index 000000000000..b9e6eaf41c9b --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-core.S | |||
@@ -0,0 +1,95 @@ | |||
1 | /* | ||
2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * Based on arch/x86/crypto/ghash-clmulni-intel_asm.S | ||
7 | * | ||
8 | * Copyright (c) 2009 Intel Corp. | ||
9 | * Author: Huang Ying <ying.huang@intel.com> | ||
10 | * Vinodh Gopal | ||
11 | * Erdinc Ozturk | ||
12 | * Deniz Karakoyunlu | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License version 2 as published | ||
16 | * by the Free Software Foundation. | ||
17 | */ | ||
18 | |||
19 | #include <linux/linkage.h> | ||
20 | #include <asm/assembler.h> | ||
21 | |||
22 | DATA .req v0 | ||
23 | SHASH .req v1 | ||
24 | IN1 .req v2 | ||
25 | T1 .req v2 | ||
26 | T2 .req v3 | ||
27 | T3 .req v4 | ||
28 | VZR .req v5 | ||
29 | |||
30 | .text | ||
31 | .arch armv8-a+crypto | ||
32 | |||
33 | /* | ||
34 | * void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
35 | * struct ghash_key const *k, const char *head) | ||
36 | */ | ||
37 | ENTRY(pmull_ghash_update) | ||
38 | ld1 {DATA.16b}, [x1] | ||
39 | ld1 {SHASH.16b}, [x3] | ||
40 | eor VZR.16b, VZR.16b, VZR.16b | ||
41 | |||
42 | /* do the head block first, if supplied */ | ||
43 | cbz x4, 0f | ||
44 | ld1 {IN1.2d}, [x4] | ||
45 | b 1f | ||
46 | |||
47 | 0: ld1 {IN1.2d}, [x2], #16 | ||
48 | sub w0, w0, #1 | ||
49 | 1: ext IN1.16b, IN1.16b, IN1.16b, #8 | ||
50 | CPU_LE( rev64 IN1.16b, IN1.16b ) | ||
51 | eor DATA.16b, DATA.16b, IN1.16b | ||
52 | |||
53 | /* multiply DATA by SHASH in GF(2^128) */ | ||
54 | ext T2.16b, DATA.16b, DATA.16b, #8 | ||
55 | ext T3.16b, SHASH.16b, SHASH.16b, #8 | ||
56 | eor T2.16b, T2.16b, DATA.16b | ||
57 | eor T3.16b, T3.16b, SHASH.16b | ||
58 | |||
59 | pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 | ||
60 | pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 | ||
61 | pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) | ||
62 | eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) | ||
63 | eor T2.16b, T2.16b, DATA.16b | ||
64 | |||
65 | ext T3.16b, VZR.16b, T2.16b, #8 | ||
66 | ext T2.16b, T2.16b, VZR.16b, #8 | ||
67 | eor DATA.16b, DATA.16b, T3.16b | ||
68 | eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of | ||
69 | // carry-less multiplication | ||
70 | |||
71 | /* first phase of the reduction */ | ||
72 | shl T3.2d, DATA.2d, #1 | ||
73 | eor T3.16b, T3.16b, DATA.16b | ||
74 | shl T3.2d, T3.2d, #5 | ||
75 | eor T3.16b, T3.16b, DATA.16b | ||
76 | shl T3.2d, T3.2d, #57 | ||
77 | ext T2.16b, VZR.16b, T3.16b, #8 | ||
78 | ext T3.16b, T3.16b, VZR.16b, #8 | ||
79 | eor DATA.16b, DATA.16b, T2.16b | ||
80 | eor T1.16b, T1.16b, T3.16b | ||
81 | |||
82 | /* second phase of the reduction */ | ||
83 | ushr T2.2d, DATA.2d, #5 | ||
84 | eor T2.16b, T2.16b, DATA.16b | ||
85 | ushr T2.2d, T2.2d, #1 | ||
86 | eor T2.16b, T2.16b, DATA.16b | ||
87 | ushr T2.2d, T2.2d, #1 | ||
88 | eor T1.16b, T1.16b, T2.16b | ||
89 | eor DATA.16b, DATA.16b, T1.16b | ||
90 | |||
91 | cbnz w0, 0b | ||
92 | |||
93 | st1 {DATA.16b}, [x1] | ||
94 | ret | ||
95 | ENDPROC(pmull_ghash_update) | ||
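Not from the patch, but useful context for the pmull/pmull2 sequence above: the three carry-less multiplications implement a Karatsuba-style split of the 128-bit product. With a = a_1 x^{64} + a_0, b = b_1 x^{64} + b_0 and '+' meaning XOR (GF(2) arithmetic), the identity relied on is

  (a_1 x^{64} + a_0)(b_1 x^{64} + b_0) = a_1 b_1 x^{128} + [(a_1 + a_0)(b_1 + b_0) + a_1 b_1 + a_0 b_0] x^{64} + a_0 b_0

pmull2 produces a_1 b_1, pmull produces a_0 b_0, and the third pmull produces (a_1 + a_0)(b_1 + b_0); the two eor instructions that follow recover the middle term before the result is folded into <T1:DATA> and reduced.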
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c new file mode 100644 index 000000000000..b92baf3f68c7 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-glue.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * Accelerated GHASH implementation with ARMv8 PMULL instructions. | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of the GNU General Public License version 2 as published | ||
8 | * by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <linux/cpufeature.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/module.h> | ||
17 | |||
18 | MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); | ||
19 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
20 | MODULE_LICENSE("GPL v2"); | ||
21 | |||
22 | #define GHASH_BLOCK_SIZE 16 | ||
23 | #define GHASH_DIGEST_SIZE 16 | ||
24 | |||
25 | struct ghash_key { | ||
26 | u64 a; | ||
27 | u64 b; | ||
28 | }; | ||
29 | |||
30 | struct ghash_desc_ctx { | ||
31 | u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; | ||
32 | u8 buf[GHASH_BLOCK_SIZE]; | ||
33 | u32 count; | ||
34 | }; | ||
35 | |||
36 | asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, | ||
37 | struct ghash_key const *k, const char *head); | ||
38 | |||
39 | static int ghash_init(struct shash_desc *desc) | ||
40 | { | ||
41 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
42 | |||
43 | *ctx = (struct ghash_desc_ctx){}; | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | static int ghash_update(struct shash_desc *desc, const u8 *src, | ||
48 | unsigned int len) | ||
49 | { | ||
50 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
51 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; | ||
52 | |||
53 | ctx->count += len; | ||
54 | |||
55 | if ((partial + len) >= GHASH_BLOCK_SIZE) { | ||
56 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
57 | int blocks; | ||
58 | |||
59 | if (partial) { | ||
60 | int p = GHASH_BLOCK_SIZE - partial; | ||
61 | |||
62 | memcpy(ctx->buf + partial, src, p); | ||
63 | src += p; | ||
64 | len -= p; | ||
65 | } | ||
66 | |||
67 | blocks = len / GHASH_BLOCK_SIZE; | ||
68 | len %= GHASH_BLOCK_SIZE; | ||
69 | |||
70 | kernel_neon_begin_partial(6); | ||
71 | pmull_ghash_update(blocks, ctx->digest, src, key, | ||
72 | partial ? ctx->buf : NULL); | ||
73 | kernel_neon_end(); | ||
74 | src += blocks * GHASH_BLOCK_SIZE; | ||
75 | } | ||
76 | if (len) | ||
77 | memcpy(ctx->buf + partial, src, len); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static int ghash_final(struct shash_desc *desc, u8 *dst) | ||
82 | { | ||
83 | struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); | ||
84 | unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; | ||
85 | |||
86 | if (partial) { | ||
87 | struct ghash_key *key = crypto_shash_ctx(desc->tfm); | ||
88 | |||
89 | memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); | ||
90 | |||
91 | kernel_neon_begin_partial(6); | ||
92 | pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); | ||
93 | kernel_neon_end(); | ||
94 | } | ||
95 | put_unaligned_be64(ctx->digest[1], dst); | ||
96 | put_unaligned_be64(ctx->digest[0], dst + 8); | ||
97 | |||
98 | *ctx = (struct ghash_desc_ctx){}; | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | static int ghash_setkey(struct crypto_shash *tfm, | ||
103 | const u8 *inkey, unsigned int keylen) | ||
104 | { | ||
105 | struct ghash_key *key = crypto_shash_ctx(tfm); | ||
106 | u64 a, b; | ||
107 | |||
108 | if (keylen != GHASH_BLOCK_SIZE) { | ||
109 | crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
110 | return -EINVAL; | ||
111 | } | ||
112 | |||
113 | /* perform multiplication by 'x' in GF(2^128) */ | ||
114 | b = get_unaligned_be64(inkey); | ||
115 | a = get_unaligned_be64(inkey + 8); | ||
116 | |||
117 | key->a = (a << 1) | (b >> 63); | ||
118 | key->b = (b << 1) | (a >> 63); | ||
119 | |||
120 | if (b >> 63) | ||
121 | key->b ^= 0xc200000000000000UL; | ||
122 | |||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | static struct shash_alg ghash_alg = { | ||
127 | .digestsize = GHASH_DIGEST_SIZE, | ||
128 | .init = ghash_init, | ||
129 | .update = ghash_update, | ||
130 | .final = ghash_final, | ||
131 | .setkey = ghash_setkey, | ||
132 | .descsize = sizeof(struct ghash_desc_ctx), | ||
133 | .base = { | ||
134 | .cra_name = "ghash", | ||
135 | .cra_driver_name = "ghash-ce", | ||
136 | .cra_priority = 200, | ||
137 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
138 | .cra_blocksize = GHASH_BLOCK_SIZE, | ||
139 | .cra_ctxsize = sizeof(struct ghash_key), | ||
140 | .cra_module = THIS_MODULE, | ||
141 | }, | ||
142 | }; | ||
143 | |||
144 | static int __init ghash_ce_mod_init(void) | ||
145 | { | ||
146 | return crypto_register_shash(&ghash_alg); | ||
147 | } | ||
148 | |||
149 | static void __exit ghash_ce_mod_exit(void) | ||
150 | { | ||
151 | crypto_unregister_shash(&ghash_alg); | ||
152 | } | ||
153 | |||
154 | module_cpu_feature_match(PMULL, ghash_ce_mod_init); | ||
155 | module_exit(ghash_ce_mod_exit); | ||
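Not part of the patch: a minimal sketch of driving a "ghash" shash from kernel code. With this module loaded on a CPU that advertises PMULL, the higher-priority "ghash-ce" implementation above is typically the one selected over the generic C version. Error handling is mostly omitted.

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int ghash_example(const u8 key[16], const u8 *data,
			 unsigned int len, u8 digest[16])
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("ghash", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, 16);
	if (err)
		goto out;

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	desc->tfm = tfm;

	err = crypto_shash_init(desc);
	if (!err)
		err = crypto_shash_update(desc, data, len);
	if (!err)
		err = crypto_shash_final(desc, digest);

	kfree(desc);
out:
	crypto_free_shash(tfm);
	return err;
}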
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S new file mode 100644 index 000000000000..09d57d98609c --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-core.S | |||
@@ -0,0 +1,153 @@ | |||
1 | /* | ||
2 | * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | #include <asm/assembler.h> | ||
13 | |||
14 | .text | ||
15 | .arch armv8-a+crypto | ||
16 | |||
17 | k0 .req v0 | ||
18 | k1 .req v1 | ||
19 | k2 .req v2 | ||
20 | k3 .req v3 | ||
21 | |||
22 | t0 .req v4 | ||
23 | t1 .req v5 | ||
24 | |||
25 | dga .req q6 | ||
26 | dgav .req v6 | ||
27 | dgb .req s7 | ||
28 | dgbv .req v7 | ||
29 | |||
30 | dg0q .req q12 | ||
31 | dg0s .req s12 | ||
32 | dg0v .req v12 | ||
33 | dg1s .req s13 | ||
34 | dg1v .req v13 | ||
35 | dg2s .req s14 | ||
36 | |||
37 | .macro add_only, op, ev, rc, s0, dg1 | ||
38 | .ifc \ev, ev | ||
39 | add t1.4s, v\s0\().4s, \rc\().4s | ||
40 | sha1h dg2s, dg0s | ||
41 | .ifnb \dg1 | ||
42 | sha1\op dg0q, \dg1, t0.4s | ||
43 | .else | ||
44 | sha1\op dg0q, dg1s, t0.4s | ||
45 | .endif | ||
46 | .else | ||
47 | .ifnb \s0 | ||
48 | add t0.4s, v\s0\().4s, \rc\().4s | ||
49 | .endif | ||
50 | sha1h dg1s, dg0s | ||
51 | sha1\op dg0q, dg2s, t1.4s | ||
52 | .endif | ||
53 | .endm | ||
54 | |||
55 | .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 | ||
56 | sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s | ||
57 | add_only \op, \ev, \rc, \s1, \dg1 | ||
58 | sha1su1 v\s0\().4s, v\s3\().4s | ||
59 | .endm | ||
60 | |||
61 | /* | ||
62 | * The SHA1 round constants | ||
63 | */ | ||
64 | .align 4 | ||
65 | .Lsha1_rcon: | ||
66 | .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 | ||
67 | |||
68 | /* | ||
69 | * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
70 | * u8 *head, long bytes) | ||
71 | */ | ||
72 | ENTRY(sha1_ce_transform) | ||
73 | /* load round constants */ | ||
74 | adr x6, .Lsha1_rcon | ||
75 | ld1r {k0.4s}, [x6], #4 | ||
76 | ld1r {k1.4s}, [x6], #4 | ||
77 | ld1r {k2.4s}, [x6], #4 | ||
78 | ld1r {k3.4s}, [x6] | ||
79 | |||
80 | /* load state */ | ||
81 | ldr dga, [x2] | ||
82 | ldr dgb, [x2, #16] | ||
83 | |||
84 | /* load partial state (if supplied) */ | ||
85 | cbz x3, 0f | ||
86 | ld1 {v8.4s-v11.4s}, [x3] | ||
87 | b 1f | ||
88 | |||
89 | /* load input */ | ||
90 | 0: ld1 {v8.4s-v11.4s}, [x1], #64 | ||
91 | sub w0, w0, #1 | ||
92 | |||
93 | 1: | ||
94 | CPU_LE( rev32 v8.16b, v8.16b ) | ||
95 | CPU_LE( rev32 v9.16b, v9.16b ) | ||
96 | CPU_LE( rev32 v10.16b, v10.16b ) | ||
97 | CPU_LE( rev32 v11.16b, v11.16b ) | ||
98 | |||
99 | 2: add t0.4s, v8.4s, k0.4s | ||
100 | mov dg0v.16b, dgav.16b | ||
101 | |||
102 | add_update c, ev, k0, 8, 9, 10, 11, dgb | ||
103 | add_update c, od, k0, 9, 10, 11, 8 | ||
104 | add_update c, ev, k0, 10, 11, 8, 9 | ||
105 | add_update c, od, k0, 11, 8, 9, 10 | ||
106 | add_update c, ev, k1, 8, 9, 10, 11 | ||
107 | |||
108 | add_update p, od, k1, 9, 10, 11, 8 | ||
109 | add_update p, ev, k1, 10, 11, 8, 9 | ||
110 | add_update p, od, k1, 11, 8, 9, 10 | ||
111 | add_update p, ev, k1, 8, 9, 10, 11 | ||
112 | add_update p, od, k2, 9, 10, 11, 8 | ||
113 | |||
114 | add_update m, ev, k2, 10, 11, 8, 9 | ||
115 | add_update m, od, k2, 11, 8, 9, 10 | ||
116 | add_update m, ev, k2, 8, 9, 10, 11 | ||
117 | add_update m, od, k2, 9, 10, 11, 8 | ||
118 | add_update m, ev, k3, 10, 11, 8, 9 | ||
119 | |||
120 | add_update p, od, k3, 11, 8, 9, 10 | ||
121 | add_only p, ev, k3, 9 | ||
122 | add_only p, od, k3, 10 | ||
123 | add_only p, ev, k3, 11 | ||
124 | add_only p, od | ||
125 | |||
126 | /* update state */ | ||
127 | add dgbv.2s, dgbv.2s, dg1v.2s | ||
128 | add dgav.4s, dgav.4s, dg0v.4s | ||
129 | |||
130 | cbnz w0, 0b | ||
131 | |||
132 | /* | ||
133 | * Final block: add padding and total bit count. | ||
134 | * Skip if we have no total byte count in x4. In that case, the input | ||
135 | * size was not a round multiple of the block size, and the padding is | ||
136 | * handled by the C code. | ||
137 | */ | ||
138 | cbz x4, 3f | ||
139 | movi v9.2d, #0 | ||
140 | mov x8, #0x80000000 | ||
141 | movi v10.2d, #0 | ||
142 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
143 | fmov d8, x8 | ||
144 | mov x4, #0 | ||
145 | mov v11.d[0], xzr | ||
146 | mov v11.d[1], x7 | ||
147 | b 2b | ||
148 | |||
149 | /* store new state */ | ||
150 | 3: str dga, [x2] | ||
151 | str dgb, [x2, #16] | ||
152 | ret | ||
153 | ENDPROC(sha1_ce_transform) | ||
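For reference (not from the patch), the final-block path above is equivalent to feeding one extra block of standard SHA-1 padding through the transform: a 0x80 marker, zeroes, and the message length in bits stored big-endian in the last 8 bytes. A C sketch of that block, where byte_count stands for the value passed in x4:

#include <linux/string.h>
#include <linux/types.h>
#include <asm/byteorder.h>

static void sha1_build_final_block(u8 final_block[64], u64 byte_count)
{
	__be64 bit_count = cpu_to_be64(byte_count << 3);

	memset(final_block, 0, 64);
	final_block[0] = 0x80;				/* padding marker       */
	memcpy(final_block + 56, &bit_count, sizeof(bit_count));
}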
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c new file mode 100644 index 000000000000..6fe83f37a750 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-glue.c | |||
@@ -0,0 +1,174 @@ | |||
1 | /* | ||
2 | * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <crypto/sha.h> | ||
15 | #include <linux/cpufeature.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); | ||
20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
21 | MODULE_LICENSE("GPL v2"); | ||
22 | |||
23 | asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, | ||
24 | u8 *head, long bytes); | ||
25 | |||
26 | static int sha1_init(struct shash_desc *desc) | ||
27 | { | ||
28 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
29 | |||
30 | *sctx = (struct sha1_state){ | ||
31 | .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, | ||
32 | }; | ||
33 | return 0; | ||
34 | } | ||
35 | |||
36 | static int sha1_update(struct shash_desc *desc, const u8 *data, | ||
37 | unsigned int len) | ||
38 | { | ||
39 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
40 | unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; | ||
41 | |||
42 | sctx->count += len; | ||
43 | |||
44 | if ((partial + len) >= SHA1_BLOCK_SIZE) { | ||
45 | int blocks; | ||
46 | |||
47 | if (partial) { | ||
48 | int p = SHA1_BLOCK_SIZE - partial; | ||
49 | |||
50 | memcpy(sctx->buffer + partial, data, p); | ||
51 | data += p; | ||
52 | len -= p; | ||
53 | } | ||
54 | |||
55 | blocks = len / SHA1_BLOCK_SIZE; | ||
56 | len %= SHA1_BLOCK_SIZE; | ||
57 | |||
58 | kernel_neon_begin_partial(16); | ||
59 | sha1_ce_transform(blocks, data, sctx->state, | ||
60 | partial ? sctx->buffer : NULL, 0); | ||
61 | kernel_neon_end(); | ||
62 | |||
63 | data += blocks * SHA1_BLOCK_SIZE; | ||
64 | partial = 0; | ||
65 | } | ||
66 | if (len) | ||
67 | memcpy(sctx->buffer + partial, data, len); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static int sha1_final(struct shash_desc *desc, u8 *out) | ||
72 | { | ||
73 | static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; | ||
74 | |||
75 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
76 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
77 | __be32 *dst = (__be32 *)out; | ||
78 | int i; | ||
79 | |||
80 | u32 padlen = SHA1_BLOCK_SIZE | ||
81 | - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); | ||
82 | |||
83 | sha1_update(desc, padding, padlen); | ||
84 | sha1_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
85 | |||
86 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
87 | put_unaligned_be32(sctx->state[i], dst++); | ||
88 | |||
89 | *sctx = (struct sha1_state){}; | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | static int sha1_finup(struct shash_desc *desc, const u8 *data, | ||
94 | unsigned int len, u8 *out) | ||
95 | { | ||
96 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
97 | __be32 *dst = (__be32 *)out; | ||
98 | int blocks; | ||
99 | int i; | ||
100 | |||
101 | if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { | ||
102 | sha1_update(desc, data, len); | ||
103 | return sha1_final(desc, out); | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Use a fast path if the input is a multiple of 64 bytes. In | ||
108 | * this case, there is no need to copy data around, and we can | ||
109 | * perform the entire digest calculation in a single invocation | ||
110 | * of sha1_ce_transform() | ||
111 | */ | ||
112 | blocks = len / SHA1_BLOCK_SIZE; | ||
113 | |||
114 | kernel_neon_begin_partial(16); | ||
115 | sha1_ce_transform(blocks, data, sctx->state, NULL, len); | ||
116 | kernel_neon_end(); | ||
117 | |||
118 | for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) | ||
119 | put_unaligned_be32(sctx->state[i], dst++); | ||
120 | |||
121 | *sctx = (struct sha1_state){}; | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | static int sha1_export(struct shash_desc *desc, void *out) | ||
126 | { | ||
127 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
128 | struct sha1_state *dst = out; | ||
129 | |||
130 | *dst = *sctx; | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | static int sha1_import(struct shash_desc *desc, const void *in) | ||
135 | { | ||
136 | struct sha1_state *sctx = shash_desc_ctx(desc); | ||
137 | struct sha1_state const *src = in; | ||
138 | |||
139 | *sctx = *src; | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | static struct shash_alg alg = { | ||
144 | .init = sha1_init, | ||
145 | .update = sha1_update, | ||
146 | .final = sha1_final, | ||
147 | .finup = sha1_finup, | ||
148 | .export = sha1_export, | ||
149 | .import = sha1_import, | ||
150 | .descsize = sizeof(struct sha1_state), | ||
151 | .digestsize = SHA1_DIGEST_SIZE, | ||
152 | .statesize = sizeof(struct sha1_state), | ||
153 | .base = { | ||
154 | .cra_name = "sha1", | ||
155 | .cra_driver_name = "sha1-ce", | ||
156 | .cra_priority = 200, | ||
157 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
158 | .cra_blocksize = SHA1_BLOCK_SIZE, | ||
159 | .cra_module = THIS_MODULE, | ||
160 | } | ||
161 | }; | ||
162 | |||
163 | static int __init sha1_ce_mod_init(void) | ||
164 | { | ||
165 | return crypto_register_shash(&alg); | ||
166 | } | ||
167 | |||
168 | static void __exit sha1_ce_mod_fini(void) | ||
169 | { | ||
170 | crypto_unregister_shash(&alg); | ||
171 | } | ||
172 | |||
173 | module_cpu_feature_match(SHA1, sha1_ce_mod_init); | ||
174 | module_exit(sha1_ce_mod_fini); | ||
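For illustration only (not part of the patch): a minimal sketch of how the shash callbacks registered above get exercised from other kernel code. The helper name sha1_digest_example is made up; the crypto API calls are the standard shash ones. Because "sha1-ce" registers with cra_priority 200, a request for "sha1" resolves to this driver whenever the CPU implements the SHA1 extension.

	#include <crypto/hash.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	/* Hypothetical caller, for illustration only; out must hold 20 bytes. */
	static int sha1_digest_example(const u8 *data, unsigned int len, u8 *out)
	{
		struct crypto_shash *tfm;
		struct shash_desc *desc;
		int ret;

		/* "sha1" picks the highest-priority provider, i.e. "sha1-ce" here. */
		tfm = crypto_alloc_shash("sha1", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(tfm);
			return -ENOMEM;
		}
		desc->tfm = tfm;

		/* Ends up in the sha1_init()/sha1_update()/sha1_finup() callbacks above. */
		ret = crypto_shash_digest(desc, data, len, out);

		kfree(desc);
		crypto_free_shash(tfm);
		return ret;
	}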
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S new file mode 100644 index 000000000000..7f29fc031ea8 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-core.S | |||
@@ -0,0 +1,156 @@ | |||
1 | /* | ||
2 | * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/linkage.h> | ||
12 | #include <asm/assembler.h> | ||
13 | |||
14 | .text | ||
15 | .arch armv8-a+crypto | ||
16 | |||
17 | dga .req q20 | ||
18 | dgav .req v20 | ||
19 | dgb .req q21 | ||
20 | dgbv .req v21 | ||
21 | |||
22 | t0 .req v22 | ||
23 | t1 .req v23 | ||
24 | |||
25 | dg0q .req q24 | ||
26 | dg0v .req v24 | ||
27 | dg1q .req q25 | ||
28 | dg1v .req v25 | ||
29 | dg2q .req q26 | ||
30 | dg2v .req v26 | ||
31 | |||
32 | .macro add_only, ev, rc, s0 | ||
33 | mov dg2v.16b, dg0v.16b | ||
34 | .ifeq \ev | ||
35 | add t1.4s, v\s0\().4s, \rc\().4s | ||
36 | sha256h dg0q, dg1q, t0.4s | ||
37 | sha256h2 dg1q, dg2q, t0.4s | ||
38 | .else | ||
39 | .ifnb \s0 | ||
40 | add t0.4s, v\s0\().4s, \rc\().4s | ||
41 | .endif | ||
42 | sha256h dg0q, dg1q, t1.4s | ||
43 | sha256h2 dg1q, dg2q, t1.4s | ||
44 | .endif | ||
45 | .endm | ||
46 | |||
47 | .macro add_update, ev, rc, s0, s1, s2, s3 | ||
48 | sha256su0 v\s0\().4s, v\s1\().4s | ||
49 | add_only \ev, \rc, \s1 | ||
50 | sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s | ||
51 | .endm | ||
52 | |||
53 | /* | ||
54 | * The SHA-256 round constants | ||
55 | */ | ||
56 | .align 4 | ||
57 | .Lsha2_rcon: | ||
58 | .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 | ||
59 | .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 | ||
60 | .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 | ||
61 | .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 | ||
62 | .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc | ||
63 | .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da | ||
64 | .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 | ||
65 | .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 | ||
66 | .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 | ||
67 | .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 | ||
68 | .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 | ||
69 | .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 | ||
70 | .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 | ||
71 | .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 | ||
72 | .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 | ||
73 | .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | ||
74 | |||
75 | /* | ||
76 | * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
77 | * u8 *head, long bytes) | ||
78 | */ | ||
79 | ENTRY(sha2_ce_transform) | ||
80 | /* load round constants */ | ||
81 | adr x8, .Lsha2_rcon | ||
82 | ld1 { v0.4s- v3.4s}, [x8], #64 | ||
83 | ld1 { v4.4s- v7.4s}, [x8], #64 | ||
84 | ld1 { v8.4s-v11.4s}, [x8], #64 | ||
85 | ld1 {v12.4s-v15.4s}, [x8] | ||
86 | |||
87 | /* load state */ | ||
88 | ldp dga, dgb, [x2] | ||
89 | |||
90 | /* load partial input (if supplied) */ | ||
91 | cbz x3, 0f | ||
92 | ld1 {v16.4s-v19.4s}, [x3] | ||
93 | b 1f | ||
94 | |||
95 | /* load input */ | ||
96 | 0: ld1 {v16.4s-v19.4s}, [x1], #64 | ||
97 | sub w0, w0, #1 | ||
98 | |||
99 | 1: | ||
100 | CPU_LE( rev32 v16.16b, v16.16b ) | ||
101 | CPU_LE( rev32 v17.16b, v17.16b ) | ||
102 | CPU_LE( rev32 v18.16b, v18.16b ) | ||
103 | CPU_LE( rev32 v19.16b, v19.16b ) | ||
104 | |||
105 | 2: add t0.4s, v16.4s, v0.4s | ||
106 | mov dg0v.16b, dgav.16b | ||
107 | mov dg1v.16b, dgbv.16b | ||
108 | |||
109 | add_update 0, v1, 16, 17, 18, 19 | ||
110 | add_update 1, v2, 17, 18, 19, 16 | ||
111 | add_update 0, v3, 18, 19, 16, 17 | ||
112 | add_update 1, v4, 19, 16, 17, 18 | ||
113 | |||
114 | add_update 0, v5, 16, 17, 18, 19 | ||
115 | add_update 1, v6, 17, 18, 19, 16 | ||
116 | add_update 0, v7, 18, 19, 16, 17 | ||
117 | add_update 1, v8, 19, 16, 17, 18 | ||
118 | |||
119 | add_update 0, v9, 16, 17, 18, 19 | ||
120 | add_update 1, v10, 17, 18, 19, 16 | ||
121 | add_update 0, v11, 18, 19, 16, 17 | ||
122 | add_update 1, v12, 19, 16, 17, 18 | ||
123 | |||
124 | add_only 0, v13, 17 | ||
125 | add_only 1, v14, 18 | ||
126 | add_only 0, v15, 19 | ||
127 | add_only 1 | ||
128 | |||
129 | /* update state */ | ||
130 | add dgav.4s, dgav.4s, dg0v.4s | ||
131 | add dgbv.4s, dgbv.4s, dg1v.4s | ||
132 | |||
133 | /* handled all input blocks? */ | ||
134 | cbnz w0, 0b | ||
135 | |||
136 | /* | ||
137 | * Final block: add padding and total bit count. | ||
138 | * Skip if we have no total byte count in x4. In that case, the input | ||
139 | * size was not a round multiple of the block size, and the padding is | ||
140 | * handled by the C code. | ||
141 | */ | ||
142 | cbz x4, 3f | ||
143 | movi v17.2d, #0 | ||
144 | mov x8, #0x80000000 | ||
145 | movi v18.2d, #0 | ||
146 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) | ||
147 | fmov d16, x8 | ||
148 | mov x4, #0 | ||
149 | mov v19.d[0], xzr | ||
150 | mov v19.d[1], x7 | ||
151 | b 2b | ||
152 | |||
153 | /* store new state */ | ||
154 | 3: stp dga, dgb, [x2] | ||
155 | ret | ||
156 | ENDPROC(sha2_ce_transform) | ||
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c new file mode 100644 index 000000000000..c294e67d3925 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-glue.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions | ||
3 | * | ||
4 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <asm/neon.h> | ||
12 | #include <asm/unaligned.h> | ||
13 | #include <crypto/internal/hash.h> | ||
14 | #include <crypto/sha.h> | ||
15 | #include <linux/cpufeature.h> | ||
16 | #include <linux/crypto.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); | ||
20 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
21 | MODULE_LICENSE("GPL v2"); | ||
22 | |||
23 | asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, | ||
24 | u8 *head, long bytes); | ||
25 | |||
26 | static int sha224_init(struct shash_desc *desc) | ||
27 | { | ||
28 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
29 | |||
30 | *sctx = (struct sha256_state){ | ||
31 | .state = { | ||
32 | SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, | ||
33 | SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, | ||
34 | } | ||
35 | }; | ||
36 | return 0; | ||
37 | } | ||
38 | |||
39 | static int sha256_init(struct shash_desc *desc) | ||
40 | { | ||
41 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
42 | |||
43 | *sctx = (struct sha256_state){ | ||
44 | .state = { | ||
45 | SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, | ||
46 | SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, | ||
47 | } | ||
48 | }; | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | static int sha2_update(struct shash_desc *desc, const u8 *data, | ||
53 | unsigned int len) | ||
54 | { | ||
55 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
56 | unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; | ||
57 | |||
58 | sctx->count += len; | ||
59 | |||
60 | if ((partial + len) >= SHA256_BLOCK_SIZE) { | ||
61 | int blocks; | ||
62 | |||
63 | if (partial) { | ||
64 | int p = SHA256_BLOCK_SIZE - partial; | ||
65 | |||
66 | memcpy(sctx->buf + partial, data, p); | ||
67 | data += p; | ||
68 | len -= p; | ||
69 | } | ||
70 | |||
71 | blocks = len / SHA256_BLOCK_SIZE; | ||
72 | len %= SHA256_BLOCK_SIZE; | ||
73 | |||
74 | kernel_neon_begin_partial(28); | ||
75 | sha2_ce_transform(blocks, data, sctx->state, | ||
76 | partial ? sctx->buf : NULL, 0); | ||
77 | kernel_neon_end(); | ||
78 | |||
79 | data += blocks * SHA256_BLOCK_SIZE; | ||
80 | partial = 0; | ||
81 | } | ||
82 | if (len) | ||
83 | memcpy(sctx->buf + partial, data, len); | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | static void sha2_final(struct shash_desc *desc) | ||
88 | { | ||
89 | static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; | ||
90 | |||
91 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
92 | __be64 bits = cpu_to_be64(sctx->count << 3); | ||
93 | u32 padlen = SHA256_BLOCK_SIZE | ||
94 | - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); | ||
95 | |||
96 | sha2_update(desc, padding, padlen); | ||
97 | sha2_update(desc, (const u8 *)&bits, sizeof(bits)); | ||
98 | } | ||
99 | |||
100 | static int sha224_final(struct shash_desc *desc, u8 *out) | ||
101 | { | ||
102 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
103 | __be32 *dst = (__be32 *)out; | ||
104 | int i; | ||
105 | |||
106 | sha2_final(desc); | ||
107 | |||
108 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
109 | put_unaligned_be32(sctx->state[i], dst++); | ||
110 | |||
111 | *sctx = (struct sha256_state){}; | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static int sha256_final(struct shash_desc *desc, u8 *out) | ||
116 | { | ||
117 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
118 | __be32 *dst = (__be32 *)out; | ||
119 | int i; | ||
120 | |||
121 | sha2_final(desc); | ||
122 | |||
123 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
124 | put_unaligned_be32(sctx->state[i], dst++); | ||
125 | |||
126 | *sctx = (struct sha256_state){}; | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | static void sha2_finup(struct shash_desc *desc, const u8 *data, | ||
131 | unsigned int len) | ||
132 | { | ||
133 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
134 | int blocks; | ||
135 | |||
136 | if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { | ||
137 | sha2_update(desc, data, len); | ||
138 | sha2_final(desc); | ||
139 | return; | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Use a fast path if the input is a multiple of 64 bytes. In | ||
144 | * this case, there is no need to copy data around, and we can | ||
145 | * perform the entire digest calculation in a single invocation | ||
146 | * of sha2_ce_transform() | ||
147 | */ | ||
148 | blocks = len / SHA256_BLOCK_SIZE; | ||
149 | |||
150 | kernel_neon_begin_partial(28); | ||
151 | sha2_ce_transform(blocks, data, sctx->state, NULL, len); | ||
152 | kernel_neon_end(); | ||
153 | data += blocks * SHA256_BLOCK_SIZE; | ||
154 | } | ||
155 | |||
156 | static int sha224_finup(struct shash_desc *desc, const u8 *data, | ||
157 | unsigned int len, u8 *out) | ||
158 | { | ||
159 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
160 | __be32 *dst = (__be32 *)out; | ||
161 | int i; | ||
162 | |||
163 | sha2_finup(desc, data, len); | ||
164 | |||
165 | for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) | ||
166 | put_unaligned_be32(sctx->state[i], dst++); | ||
167 | |||
168 | *sctx = (struct sha256_state){}; | ||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | static int sha256_finup(struct shash_desc *desc, const u8 *data, | ||
173 | unsigned int len, u8 *out) | ||
174 | { | ||
175 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
176 | __be32 *dst = (__be32 *)out; | ||
177 | int i; | ||
178 | |||
179 | sha2_finup(desc, data, len); | ||
180 | |||
181 | for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) | ||
182 | put_unaligned_be32(sctx->state[i], dst++); | ||
183 | |||
184 | *sctx = (struct sha256_state){}; | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static int sha2_export(struct shash_desc *desc, void *out) | ||
189 | { | ||
190 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
191 | struct sha256_state *dst = out; | ||
192 | |||
193 | *dst = *sctx; | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static int sha2_import(struct shash_desc *desc, const void *in) | ||
198 | { | ||
199 | struct sha256_state *sctx = shash_desc_ctx(desc); | ||
200 | struct sha256_state const *src = in; | ||
201 | |||
202 | *sctx = *src; | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | static struct shash_alg algs[] = { { | ||
207 | .init = sha224_init, | ||
208 | .update = sha2_update, | ||
209 | .final = sha224_final, | ||
210 | .finup = sha224_finup, | ||
211 | .export = sha2_export, | ||
212 | .import = sha2_import, | ||
213 | .descsize = sizeof(struct sha256_state), | ||
214 | .digestsize = SHA224_DIGEST_SIZE, | ||
215 | .statesize = sizeof(struct sha256_state), | ||
216 | .base = { | ||
217 | .cra_name = "sha224", | ||
218 | .cra_driver_name = "sha224-ce", | ||
219 | .cra_priority = 200, | ||
220 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
221 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
222 | .cra_module = THIS_MODULE, | ||
223 | } | ||
224 | }, { | ||
225 | .init = sha256_init, | ||
226 | .update = sha2_update, | ||
227 | .final = sha256_final, | ||
228 | .finup = sha256_finup, | ||
229 | .export = sha2_export, | ||
230 | .import = sha2_import, | ||
231 | .descsize = sizeof(struct sha256_state), | ||
232 | .digestsize = SHA256_DIGEST_SIZE, | ||
233 | .statesize = sizeof(struct sha256_state), | ||
234 | .base = { | ||
235 | .cra_name = "sha256", | ||
236 | .cra_driver_name = "sha256-ce", | ||
237 | .cra_priority = 200, | ||
238 | .cra_flags = CRYPTO_ALG_TYPE_SHASH, | ||
239 | .cra_blocksize = SHA256_BLOCK_SIZE, | ||
240 | .cra_module = THIS_MODULE, | ||
241 | } | ||
242 | } }; | ||
243 | |||
244 | static int __init sha2_ce_mod_init(void) | ||
245 | { | ||
246 | return crypto_register_shashes(algs, ARRAY_SIZE(algs)); | ||
247 | } | ||
248 | |||
249 | static void __exit sha2_ce_mod_fini(void) | ||
250 | { | ||
251 | crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); | ||
252 | } | ||
253 | |||
254 | module_cpu_feature_match(SHA2, sha2_ce_mod_init); | ||
255 | module_exit(sha2_ce_mod_fini); | ||
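For illustration (not part of the patch): the finup fast path above only triggers when nothing is buffered and the request length is a whole number of 64-byte blocks. A 128-byte finup with an empty buffer calls sha2_ce_transform(2, data, sctx->state, NULL, 128), and the non-zero byte count in x4 lets the assembly append the padding/length block itself; a 100-byte finup instead falls back to sha2_update() plus sha2_final(), with the padding generated in C.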
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 83f71b3004a8..42c7eecd2bb6 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild | |||
@@ -40,6 +40,7 @@ generic-y += segment.h | |||
40 | generic-y += sembuf.h | 40 | generic-y += sembuf.h |
41 | generic-y += serial.h | 41 | generic-y += serial.h |
42 | generic-y += shmbuf.h | 42 | generic-y += shmbuf.h |
43 | generic-y += simd.h | ||
43 | generic-y += sizes.h | 44 | generic-y += sizes.h |
44 | generic-y += socket.h | 45 | generic-y += socket.h |
45 | generic-y += sockios.h | 46 | generic-y += sockios.h |
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index fd3e3924041b..5901480bfdca 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h | |||
@@ -21,6 +21,7 @@ | |||
21 | #endif | 21 | #endif |
22 | 22 | ||
23 | #include <asm/ptrace.h> | 23 | #include <asm/ptrace.h> |
24 | #include <asm/thread_info.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Stack pushing/popping (register pairs only). Equivalent to store decrement | 27 | * Stack pushing/popping (register pairs only). Equivalent to store decrement |
@@ -68,23 +69,31 @@ | |||
68 | msr daifclr, #8 | 69 | msr daifclr, #8 |
69 | .endm | 70 | .endm |
70 | 71 | ||
71 | .macro disable_step, tmp | 72 | .macro disable_step_tsk, flgs, tmp |
73 | tbz \flgs, #TIF_SINGLESTEP, 9990f | ||
72 | mrs \tmp, mdscr_el1 | 74 | mrs \tmp, mdscr_el1 |
73 | bic \tmp, \tmp, #1 | 75 | bic \tmp, \tmp, #1 |
74 | msr mdscr_el1, \tmp | 76 | msr mdscr_el1, \tmp |
77 | isb // Synchronise with enable_dbg | ||
78 | 9990: | ||
75 | .endm | 79 | .endm |
76 | 80 | ||
77 | .macro enable_step, tmp | 81 | .macro enable_step_tsk, flgs, tmp |
82 | tbz \flgs, #TIF_SINGLESTEP, 9990f | ||
83 | disable_dbg | ||
78 | mrs \tmp, mdscr_el1 | 84 | mrs \tmp, mdscr_el1 |
79 | orr \tmp, \tmp, #1 | 85 | orr \tmp, \tmp, #1 |
80 | msr mdscr_el1, \tmp | 86 | msr mdscr_el1, \tmp |
87 | 9990: | ||
81 | .endm | 88 | .endm |
82 | 89 | ||
83 | .macro enable_dbg_if_not_stepping, tmp | 90 | /* |
84 | mrs \tmp, mdscr_el1 | 91 | * Enable both debug exceptions and interrupts. This is likely to be |
85 | tbnz \tmp, #0, 9990f | 92 | * faster than two daifclr operations, since writes to this register |
86 | enable_dbg | 93 | * are self-synchronising. |
87 | 9990: | 94 | */ |
95 | .macro enable_dbg_and_irq | ||
96 | msr daifclr, #(8 | 2) | ||
88 | .endm | 97 | .endm |
89 | 98 | ||
90 | /* | 99 | /* |
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index 57e8cb49824c..65f1569ac96e 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h | |||
@@ -157,7 +157,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) | |||
157 | */ | 157 | */ |
158 | #define ATOMIC64_INIT(i) { (i) } | 158 | #define ATOMIC64_INIT(i) { (i) } |
159 | 159 | ||
160 | #define atomic64_read(v) (*(volatile long long *)&(v)->counter) | 160 | #define atomic64_read(v) (*(volatile long *)&(v)->counter) |
161 | #define atomic64_set(v,i) (((v)->counter) = (i)) | 161 | #define atomic64_set(v,i) (((v)->counter) = (i)) |
162 | 162 | ||
163 | static inline void atomic64_add(u64 i, atomic64_t *v) | 163 | static inline void atomic64_add(u64 i, atomic64_t *v) |
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 48b9e704af7c..6389d60574d9 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h | |||
@@ -25,12 +25,12 @@ | |||
25 | #define wfi() asm volatile("wfi" : : : "memory") | 25 | #define wfi() asm volatile("wfi" : : : "memory") |
26 | 26 | ||
27 | #define isb() asm volatile("isb" : : : "memory") | 27 | #define isb() asm volatile("isb" : : : "memory") |
28 | #define dmb(opt) asm volatile("dmb sy" : : : "memory") | 28 | #define dmb(opt) asm volatile("dmb " #opt : : : "memory") |
29 | #define dsb(opt) asm volatile("dsb sy" : : : "memory") | 29 | #define dsb(opt) asm volatile("dsb " #opt : : : "memory") |
30 | 30 | ||
31 | #define mb() dsb() | 31 | #define mb() dsb(sy) |
32 | #define rmb() asm volatile("dsb ld" : : : "memory") | 32 | #define rmb() dsb(ld) |
33 | #define wmb() asm volatile("dsb st" : : : "memory") | 33 | #define wmb() dsb(st) |
34 | 34 | ||
35 | #ifndef CONFIG_SMP | 35 | #ifndef CONFIG_SMP |
36 | #define smp_mb() barrier() | 36 | #define smp_mb() barrier() |
@@ -40,7 +40,7 @@ | |||
40 | #define smp_store_release(p, v) \ | 40 | #define smp_store_release(p, v) \ |
41 | do { \ | 41 | do { \ |
42 | compiletime_assert_atomic_type(*p); \ | 42 | compiletime_assert_atomic_type(*p); \ |
43 | smp_mb(); \ | 43 | barrier(); \ |
44 | ACCESS_ONCE(*p) = (v); \ | 44 | ACCESS_ONCE(*p) = (v); \ |
45 | } while (0) | 45 | } while (0) |
46 | 46 | ||
@@ -48,15 +48,15 @@ do { \ | |||
48 | ({ \ | 48 | ({ \ |
49 | typeof(*p) ___p1 = ACCESS_ONCE(*p); \ | 49 | typeof(*p) ___p1 = ACCESS_ONCE(*p); \ |
50 | compiletime_assert_atomic_type(*p); \ | 50 | compiletime_assert_atomic_type(*p); \ |
51 | smp_mb(); \ | 51 | barrier(); \ |
52 | ___p1; \ | 52 | ___p1; \ |
53 | }) | 53 | }) |
54 | 54 | ||
55 | #else | 55 | #else |
56 | 56 | ||
57 | #define smp_mb() asm volatile("dmb ish" : : : "memory") | 57 | #define smp_mb() dmb(ish) |
58 | #define smp_rmb() asm volatile("dmb ishld" : : : "memory") | 58 | #define smp_rmb() dmb(ishld) |
59 | #define smp_wmb() asm volatile("dmb ishst" : : : "memory") | 59 | #define smp_wmb() dmb(ishst) |
60 | 60 | ||
61 | #define smp_store_release(p, v) \ | 61 | #define smp_store_release(p, v) \ |
62 | do { \ | 62 | do { \ |
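For illustration (not part of the patch): with the stringifying definitions above, smp_wmb() now expands to asm volatile("dmb ishst" : : : "memory") and rmb() to asm volatile("dsb ld" : : : "memory"), whereas the old dmb(opt)/dsb(opt) macros discarded their argument and always emitted the heavier sy variant for their callers.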
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 390308a67f0d..88cc05b5f3ac 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h | |||
@@ -16,6 +16,8 @@ | |||
16 | #ifndef __ASM_CACHE_H | 16 | #ifndef __ASM_CACHE_H |
17 | #define __ASM_CACHE_H | 17 | #define __ASM_CACHE_H |
18 | 18 | ||
19 | #include <asm/cachetype.h> | ||
20 | |||
19 | #define L1_CACHE_SHIFT 6 | 21 | #define L1_CACHE_SHIFT 6 |
20 | #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) | 22 | #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) |
21 | 23 | ||
@@ -27,6 +29,15 @@ | |||
27 | * the CPU. | 29 | * the CPU. |
28 | */ | 30 | */ |
29 | #define ARCH_DMA_MINALIGN L1_CACHE_BYTES | 31 | #define ARCH_DMA_MINALIGN L1_CACHE_BYTES |
30 | #define ARCH_SLAB_MINALIGN 8 | 32 | |
33 | #ifndef __ASSEMBLY__ | ||
34 | |||
35 | static inline int cache_line_size(void) | ||
36 | { | ||
37 | u32 cwg = cache_type_cwg(); | ||
38 | return cwg ? 4 << cwg : L1_CACHE_BYTES; | ||
39 | } | ||
40 | |||
41 | #endif /* __ASSEMBLY__ */ | ||
31 | 42 | ||
32 | #endif | 43 | #endif |
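For illustration (not part of the patch): CTR_EL0.CWG is the log2 of the cache writeback granule in 4-byte words, so cache_line_size() converts it to bytes as 4 << cwg; a CWG field of 4 gives 4 << 4 = 64 bytes, and a zero (unreported) CWG falls back to L1_CACHE_BYTES, i.e. 1 << 6 = 64.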
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 4c60e64a801c..a5176cf32dad 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h | |||
@@ -123,7 +123,7 @@ extern void flush_dcache_page(struct page *); | |||
123 | static inline void __flush_icache_all(void) | 123 | static inline void __flush_icache_all(void) |
124 | { | 124 | { |
125 | asm("ic ialluis"); | 125 | asm("ic ialluis"); |
126 | dsb(); | 126 | dsb(ish); |
127 | } | 127 | } |
128 | 128 | ||
129 | #define flush_dcache_mmap_lock(mapping) \ | 129 | #define flush_dcache_mmap_lock(mapping) \ |
@@ -150,7 +150,7 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) | |||
150 | * set_pte_at() called from vmap_pte_range() does not | 150 | * set_pte_at() called from vmap_pte_range() does not |
151 | * have a DSB after cleaning the cache line. | 151 | * have a DSB after cleaning the cache line. |
152 | */ | 152 | */ |
153 | dsb(); | 153 | dsb(ish); |
154 | } | 154 | } |
155 | 155 | ||
156 | static inline void flush_cache_vunmap(unsigned long start, unsigned long end) | 156 | static inline void flush_cache_vunmap(unsigned long start, unsigned long end) |
diff --git a/arch/arm64/include/asm/cachetype.h b/arch/arm64/include/asm/cachetype.h index 85f5f511352a..4b23e758d5e0 100644 --- a/arch/arm64/include/asm/cachetype.h +++ b/arch/arm64/include/asm/cachetype.h | |||
@@ -20,12 +20,16 @@ | |||
20 | 20 | ||
21 | #define CTR_L1IP_SHIFT 14 | 21 | #define CTR_L1IP_SHIFT 14 |
22 | #define CTR_L1IP_MASK 3 | 22 | #define CTR_L1IP_MASK 3 |
23 | #define CTR_CWG_SHIFT 24 | ||
24 | #define CTR_CWG_MASK 15 | ||
23 | 25 | ||
24 | #define ICACHE_POLICY_RESERVED 0 | 26 | #define ICACHE_POLICY_RESERVED 0 |
25 | #define ICACHE_POLICY_AIVIVT 1 | 27 | #define ICACHE_POLICY_AIVIVT 1 |
26 | #define ICACHE_POLICY_VIPT 2 | 28 | #define ICACHE_POLICY_VIPT 2 |
27 | #define ICACHE_POLICY_PIPT 3 | 29 | #define ICACHE_POLICY_PIPT 3 |
28 | 30 | ||
31 | #ifndef __ASSEMBLY__ | ||
32 | |||
29 | static inline u32 icache_policy(void) | 33 | static inline u32 icache_policy(void) |
30 | { | 34 | { |
31 | return (read_cpuid_cachetype() >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK; | 35 | return (read_cpuid_cachetype() >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK; |
@@ -45,4 +49,11 @@ static inline int icache_is_aivivt(void) | |||
45 | return icache_policy() == ICACHE_POLICY_AIVIVT; | 49 | return icache_policy() == ICACHE_POLICY_AIVIVT; |
46 | } | 50 | } |
47 | 51 | ||
52 | static inline u32 cache_type_cwg(void) | ||
53 | { | ||
54 | return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK; | ||
55 | } | ||
56 | |||
57 | #endif /* __ASSEMBLY__ */ | ||
58 | |||
48 | #endif /* __ASM_CACHETYPE_H */ | 59 | #endif /* __ASM_CACHETYPE_H */ |
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 57c0fa7bf711..ddb9d7830558 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h | |||
@@ -72,7 +72,12 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size | |||
72 | } | 72 | } |
73 | 73 | ||
74 | #define xchg(ptr,x) \ | 74 | #define xchg(ptr,x) \ |
75 | ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) | 75 | ({ \ |
76 | __typeof__(*(ptr)) __ret; \ | ||
77 | __ret = (__typeof__(*(ptr))) \ | ||
78 | __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))); \ | ||
79 | __ret; \ | ||
80 | }) | ||
76 | 81 | ||
77 | static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, | 82 | static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, |
78 | unsigned long new, int size) | 83 | unsigned long new, int size) |
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index e71f81fe127a..253e33bc94fb 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h | |||
@@ -305,11 +305,6 @@ static inline int is_compat_thread(struct thread_info *thread) | |||
305 | 305 | ||
306 | #else /* !CONFIG_COMPAT */ | 306 | #else /* !CONFIG_COMPAT */ |
307 | 307 | ||
308 | static inline int is_compat_task(void) | ||
309 | { | ||
310 | return 0; | ||
311 | } | ||
312 | |||
313 | static inline int is_compat_thread(struct thread_info *thread) | 308 | static inline int is_compat_thread(struct thread_info *thread) |
314 | { | 309 | { |
315 | return 0; | 310 | return 0; |
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index c4a7f940b387..72674f4c3871 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h | |||
@@ -18,9 +18,11 @@ | |||
18 | #ifndef __ASM_ESR_H | 18 | #ifndef __ASM_ESR_H |
19 | #define __ASM_ESR_H | 19 | #define __ASM_ESR_H |
20 | 20 | ||
21 | #define ESR_EL1_EC_SHIFT (26) | 21 | #define ESR_EL1_WRITE (1 << 6) |
22 | #define ESR_EL1_IL (1U << 25) | 22 | #define ESR_EL1_CM (1 << 8) |
23 | #define ESR_EL1_IL (1 << 25) | ||
23 | 24 | ||
25 | #define ESR_EL1_EC_SHIFT (26) | ||
24 | #define ESR_EL1_EC_UNKNOWN (0x00) | 26 | #define ESR_EL1_EC_UNKNOWN (0x00) |
25 | #define ESR_EL1_EC_WFI (0x01) | 27 | #define ESR_EL1_EC_WFI (0x01) |
26 | #define ESR_EL1_EC_CP15_32 (0x03) | 28 | #define ESR_EL1_EC_CP15_32 (0x03) |
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index c43b4ac13008..50f559f574fe 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h | |||
@@ -37,8 +37,21 @@ struct fpsimd_state { | |||
37 | u32 fpcr; | 37 | u32 fpcr; |
38 | }; | 38 | }; |
39 | }; | 39 | }; |
40 | /* the id of the last cpu to have restored this state */ | ||
41 | unsigned int cpu; | ||
40 | }; | 42 | }; |
41 | 43 | ||
44 | /* | ||
45 | * Struct for stacking the bottom 'n' FP/SIMD registers. | ||
46 | */ | ||
47 | struct fpsimd_partial_state { | ||
48 | u32 fpsr; | ||
49 | u32 fpcr; | ||
50 | u32 num_regs; | ||
51 | __uint128_t vregs[32]; | ||
52 | }; | ||
53 | |||
54 | |||
42 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) | 55 | #if defined(__KERNEL__) && defined(CONFIG_COMPAT) |
43 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ | 56 | /* Masks for extracting the FPSR and FPCR from the FPSCR */ |
44 | #define VFP_FPSCR_STAT_MASK 0xf800009f | 57 | #define VFP_FPSCR_STAT_MASK 0xf800009f |
@@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state); | |||
58 | extern void fpsimd_thread_switch(struct task_struct *next); | 71 | extern void fpsimd_thread_switch(struct task_struct *next); |
59 | extern void fpsimd_flush_thread(void); | 72 | extern void fpsimd_flush_thread(void); |
60 | 73 | ||
74 | extern void fpsimd_preserve_current_state(void); | ||
75 | extern void fpsimd_restore_current_state(void); | ||
76 | extern void fpsimd_update_current_state(struct fpsimd_state *state); | ||
77 | |||
78 | extern void fpsimd_flush_task_state(struct task_struct *target); | ||
79 | |||
80 | extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, | ||
81 | u32 num_regs); | ||
82 | extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); | ||
83 | |||
61 | #endif | 84 | #endif |
62 | 85 | ||
63 | #endif | 86 | #endif |
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index bbec599c96bd..768414d55e64 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h | |||
@@ -62,3 +62,38 @@ | |||
62 | ldr w\tmpnr, [\state, #16 * 2 + 4] | 62 | ldr w\tmpnr, [\state, #16 * 2 + 4] |
63 | msr fpcr, x\tmpnr | 63 | msr fpcr, x\tmpnr |
64 | .endm | 64 | .endm |
65 | |||
66 | .altmacro | ||
67 | .macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 | ||
68 | mrs x\tmpnr1, fpsr | ||
69 | str w\numnr, [\state, #8] | ||
70 | mrs x\tmpnr2, fpcr | ||
71 | stp w\tmpnr1, w\tmpnr2, [\state] | ||
72 | adr x\tmpnr1, 0f | ||
73 | add \state, \state, x\numnr, lsl #4 | ||
74 | sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 | ||
75 | br x\tmpnr1 | ||
76 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
77 | .irp qb, %(qa + 1) | ||
78 | stp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
79 | .endr | ||
80 | .endr | ||
81 | 0: | ||
82 | .endm | ||
83 | |||
84 | .macro fpsimd_restore_partial state, tmpnr1, tmpnr2 | ||
85 | ldp w\tmpnr1, w\tmpnr2, [\state] | ||
86 | msr fpsr, x\tmpnr1 | ||
87 | msr fpcr, x\tmpnr2 | ||
88 | adr x\tmpnr1, 0f | ||
89 | ldr w\tmpnr2, [\state, #8] | ||
90 | add \state, \state, x\tmpnr2, lsl #4 | ||
91 | sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 | ||
92 | br x\tmpnr1 | ||
93 | .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 | ||
94 | .irp qb, %(qa + 1) | ||
95 | ldp q\qa, q\qb, [\state, # -16 * \qa - 16] | ||
96 | .endr | ||
97 | .endr | ||
98 | 0: | ||
99 | .endm | ||
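For illustration (not part of the patch): the .irp block in fpsimd_save_partial emits sixteen 4-byte stp instructions covering q31..q0 in pairs. The macro branches to 0f minus num_regs * 2 bytes, so only the last num_regs / 2 stores run; with num_regs = 16 (the value the SHA-1 glue above passes to kernel_neon_begin_partial()) that is 32 bytes back, leaving just the eight stp instructions for q0-q15, and the earlier add of num_regs << 4 to the state pointer makes their negative offsets land in the vregs[] area right after the 16-byte fpsr/fpcr/num_regs header.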
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h new file mode 100644 index 000000000000..c5534facf941 --- /dev/null +++ b/arch/arm64/include/asm/ftrace.h | |||
@@ -0,0 +1,59 @@ | |||
1 | /* | ||
2 | * arch/arm64/include/asm/ftrace.h | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | #ifndef __ASM_FTRACE_H | ||
12 | #define __ASM_FTRACE_H | ||
13 | |||
14 | #include <asm/insn.h> | ||
15 | |||
16 | #define MCOUNT_ADDR ((unsigned long)_mcount) | ||
17 | #define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE | ||
18 | |||
19 | #ifndef __ASSEMBLY__ | ||
20 | #include <linux/compat.h> | ||
21 | |||
22 | extern void _mcount(unsigned long); | ||
23 | extern void *return_address(unsigned int); | ||
24 | |||
25 | struct dyn_arch_ftrace { | ||
26 | /* No extra data needed for arm64 */ | ||
27 | }; | ||
28 | |||
29 | extern unsigned long ftrace_graph_call; | ||
30 | |||
31 | static inline unsigned long ftrace_call_adjust(unsigned long addr) | ||
32 | { | ||
33 | /* | ||
34 | * addr is the address of the mcount call instruction. | ||
35 | * recordmcount does the necessary offset calculation. | ||
36 | */ | ||
37 | return addr; | ||
38 | } | ||
39 | |||
40 | #define ftrace_return_address(n) return_address(n) | ||
41 | |||
42 | /* | ||
43 | * Because AArch32 mode does not share the same syscall table with AArch64, | ||
44 | * tracing compat syscalls may result in reporting bogus syscalls or even | ||
45 | * a hang, so just do not trace them. | ||
46 | * See kernel/trace/trace_syscalls.c | ||
47 | * | ||
48 | * x86 code says: | ||
49 | * If the user really wants these, then they should use the | ||

50 | * raw syscall tracepoints with filtering. | ||
51 | */ | ||
52 | #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS | ||
53 | static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) | ||
54 | { | ||
55 | return is_compat_task(); | ||
56 | } | ||
57 | #endif /* ifndef __ASSEMBLY__ */ | ||
58 | |||
59 | #endif /* __ASM_FTRACE_H */ | ||
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index ae4801d77514..0be67821f9ce 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h | |||
@@ -20,7 +20,7 @@ | |||
20 | #include <linux/threads.h> | 20 | #include <linux/threads.h> |
21 | #include <asm/irq.h> | 21 | #include <asm/irq.h> |
22 | 22 | ||
23 | #define NR_IPI 5 | 23 | #define NR_IPI 6 |
24 | 24 | ||
25 | typedef struct { | 25 | typedef struct { |
26 | unsigned int __softirq_pending; | 26 | unsigned int __softirq_pending; |
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index c44ad39ed310..dc1f73b13e74 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h | |||
@@ -21,6 +21,7 @@ | |||
21 | /* A64 instructions are always 32 bits. */ | 21 | /* A64 instructions are always 32 bits. */ |
22 | #define AARCH64_INSN_SIZE 4 | 22 | #define AARCH64_INSN_SIZE 4 |
23 | 23 | ||
24 | #ifndef __ASSEMBLY__ | ||
24 | /* | 25 | /* |
25 | * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a | 26 | * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a |
26 | * Section C3.1 "A64 instruction index by encoding": | 27 | * Section C3.1 "A64 instruction index by encoding": |
@@ -104,5 +105,6 @@ bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn); | |||
104 | int aarch64_insn_patch_text_nosync(void *addr, u32 insn); | 105 | int aarch64_insn_patch_text_nosync(void *addr, u32 insn); |
105 | int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); | 106 | int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); |
106 | int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); | 107 | int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); |
108 | #endif /* __ASSEMBLY__ */ | ||
107 | 109 | ||
108 | #endif /* __ASM_INSN_H */ | 110 | #endif /* __ASM_INSN_H */ |
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index a1bef78f0303..e0ecdcf6632d 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h | |||
@@ -230,19 +230,11 @@ extern void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot | |||
230 | extern void __iounmap(volatile void __iomem *addr); | 230 | extern void __iounmap(volatile void __iomem *addr); |
231 | extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); | 231 | extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); |
232 | 232 | ||
233 | #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_DIRTY) | ||
234 | #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) | ||
235 | #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL_NC)) | ||
236 | #define PROT_NORMAL (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) | ||
237 | |||
238 | #define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) | 233 | #define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) |
239 | #define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) | 234 | #define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) |
240 | #define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) | 235 | #define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) |
241 | #define iounmap __iounmap | 236 | #define iounmap __iounmap |
242 | 237 | ||
243 | #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) | ||
244 | #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PTE_PXN | PTE_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) | ||
245 | |||
246 | #define ARCH_HAS_IOREMAP_WC | 238 | #define ARCH_HAS_IOREMAP_WC |
247 | #include <asm-generic/iomap.h> | 239 | #include <asm-generic/iomap.h> |
248 | 240 | ||
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index b0cc58a97780..13ce4cc18e26 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h | |||
@@ -8,7 +8,11 @@ | |||
8 | * published by the Free Software Foundation. | 8 | * published by the Free Software Foundation. |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/types.h> | ||
12 | |||
11 | #define cpu_has_neon() (1) | 13 | #define cpu_has_neon() (1) |
12 | 14 | ||
13 | void kernel_neon_begin(void); | 15 | #define kernel_neon_begin() kernel_neon_begin_partial(32) |
16 | |||
17 | void kernel_neon_begin_partial(u32 num_regs); | ||
14 | void kernel_neon_end(void); | 18 | void kernel_neon_end(void); |
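For illustration (not part of the patch): a minimal sketch of the calling convention the updated header exposes; do_neon_work() is a made-up name, and the pattern mirrors what the sha1/sha2 glue code above does.

	#include <asm/neon.h>

	static void do_neon_work(void)		/* hypothetical caller */
	{
		/*
		 * Only q0-q15 are clobbered here, so request a partial
		 * save/restore; kernel_neon_begin() is now simply the
		 * full 32-register case.
		 */
		kernel_neon_begin_partial(16);
		/* ... NEON / Crypto Extensions instructions ... */
		kernel_neon_end();
	}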
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5fc8a66c3924..955e8c5f0afb 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h | |||
@@ -29,6 +29,8 @@ | |||
29 | */ | 29 | */ |
30 | 30 | ||
31 | #define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) | 31 | #define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) |
32 | #define PUD_TYPE_MASK (_AT(pgdval_t, 3) << 0) | ||
33 | #define PUD_TYPE_SECT (_AT(pgdval_t, 1) << 0) | ||
32 | 34 | ||
33 | /* | 35 | /* |
34 | * Level 2 descriptor (PMD). | 36 | * Level 2 descriptor (PMD). |
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e2f96748859b..598cc384fc1c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h | |||
@@ -52,66 +52,59 @@ extern void __pgd_error(const char *file, int line, unsigned long val); | |||
52 | #endif | 52 | #endif |
53 | #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) | 53 | #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) |
54 | 54 | ||
55 | /* | 55 | #ifdef CONFIG_SMP |
56 | * The pgprot_* and protection_map entries will be fixed up at runtime to | 56 | #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) |
57 | * include the cachable and bufferable bits based on memory policy, as well as | 57 | #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) |
58 | * any architecture dependent bits like global/ASID and SMP shared mapping | 58 | #else |
59 | * bits. | 59 | #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF) |
60 | */ | 60 | #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) |
61 | #define _PAGE_DEFAULT PTE_TYPE_PAGE | PTE_AF | 61 | #endif |
62 | 62 | ||
63 | extern pgprot_t pgprot_default; | 63 | #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) |
64 | #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC)) | ||
65 | #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL)) | ||
64 | 66 | ||
65 | #define __pgprot_modify(prot,mask,bits) \ | 67 | #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) |
66 | __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) | 68 | #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) |
69 | #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) | ||
67 | 70 | ||
68 | #define _MOD_PROT(p, b) __pgprot_modify(p, 0, b) | 71 | #define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) |
69 | 72 | ||
70 | #define PAGE_NONE __pgprot_modify(pgprot_default, PTE_TYPE_MASK, PTE_PROT_NONE | PTE_PXN | PTE_UXN) | 73 | #define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) |
71 | #define PAGE_SHARED _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) | 74 | #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) |
72 | #define PAGE_SHARED_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) | ||
73 | #define PAGE_COPY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | ||
74 | #define PAGE_COPY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) | ||
75 | #define PAGE_READONLY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | ||
76 | #define PAGE_READONLY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) | ||
77 | #define PAGE_KERNEL _MOD_PROT(pgprot_default, PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) | ||
78 | #define PAGE_KERNEL_EXEC _MOD_PROT(pgprot_default, PTE_UXN | PTE_DIRTY | PTE_WRITE) | ||
79 | 75 | ||
80 | #define PAGE_HYP _MOD_PROT(pgprot_default, PTE_HYP) | 76 | #define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) |
81 | #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) | 77 | #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) |
82 | 78 | ||
83 | #define PAGE_S2 __pgprot_modify(pgprot_default, PTE_S2_MEMATTR_MASK, PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) | 79 | #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) |
84 | #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN) | 80 | #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN) |
85 | 81 | ||
86 | #define __PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) | 82 | #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) |
87 | #define __PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) | 83 | #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) |
88 | #define __PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) | 84 | #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) |
89 | #define __PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | 85 | #define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) |
90 | #define __PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) | 86 | #define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) |
91 | #define __PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) | 87 | #define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) |
92 | #define __PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) | 88 | #define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) |
93 | 89 | ||
94 | #endif /* __ASSEMBLY__ */ | 90 | #define __P000 PAGE_NONE |
95 | 91 | #define __P001 PAGE_READONLY | |
96 | #define __P000 __PAGE_NONE | 92 | #define __P010 PAGE_COPY |
97 | #define __P001 __PAGE_READONLY | 93 | #define __P011 PAGE_COPY |
98 | #define __P010 __PAGE_COPY | 94 | #define __P100 PAGE_READONLY_EXEC |
99 | #define __P011 __PAGE_COPY | 95 | #define __P101 PAGE_READONLY_EXEC |
100 | #define __P100 __PAGE_READONLY_EXEC | 96 | #define __P110 PAGE_COPY_EXEC |
101 | #define __P101 __PAGE_READONLY_EXEC | 97 | #define __P111 PAGE_COPY_EXEC |
102 | #define __P110 __PAGE_COPY_EXEC | 98 | |
103 | #define __P111 __PAGE_COPY_EXEC | 99 | #define __S000 PAGE_NONE |
104 | 100 | #define __S001 PAGE_READONLY | |
105 | #define __S000 __PAGE_NONE | 101 | #define __S010 PAGE_SHARED |
106 | #define __S001 __PAGE_READONLY | 102 | #define __S011 PAGE_SHARED |
107 | #define __S010 __PAGE_SHARED | 103 | #define __S100 PAGE_READONLY_EXEC |
108 | #define __S011 __PAGE_SHARED | 104 | #define __S101 PAGE_READONLY_EXEC |
109 | #define __S100 __PAGE_READONLY_EXEC | 105 | #define __S110 PAGE_SHARED_EXEC |
110 | #define __S101 __PAGE_READONLY_EXEC | 106 | #define __S111 PAGE_SHARED_EXEC |
111 | #define __S110 __PAGE_SHARED_EXEC | ||
112 | #define __S111 __PAGE_SHARED_EXEC | ||
113 | 107 | ||
114 | #ifndef __ASSEMBLY__ | ||
115 | /* | 108 | /* |
116 | * ZERO_PAGE is a global shared page that is always zero: used | 109 | * ZERO_PAGE is a global shared page that is always zero: used |
117 | * for zero-mapped memory areas etc.. | 110 | * for zero-mapped memory areas etc.. |
@@ -265,6 +258,7 @@ static inline pmd_t pte_pmd(pte_t pte) | |||
265 | #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) | 258 | #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) |
266 | 259 | ||
267 | #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) | 260 | #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) |
261 | #define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT) | ||
268 | 262 | ||
269 | #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) | 263 | #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) |
270 | 264 | ||
@@ -273,6 +267,9 @@ static inline int has_transparent_hugepage(void) | |||
273 | return 1; | 267 | return 1; |
274 | } | 268 | } |
275 | 269 | ||
270 | #define __pgprot_modify(prot,mask,bits) \ | ||
271 | __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) | ||
272 | |||
276 | /* | 273 | /* |
277 | * Mark the prot value as uncacheable and unbufferable. | 274 | * Mark the prot value as uncacheable and unbufferable. |
278 | */ | 275 | */ |
@@ -295,11 +292,17 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | |||
295 | #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ | 292 | #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ |
296 | PMD_TYPE_SECT) | 293 | PMD_TYPE_SECT) |
297 | 294 | ||
295 | #ifdef CONFIG_ARM64_64K_PAGES | ||
296 | #define pud_sect(pud) (0) | ||
297 | #else | ||
298 | #define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ | ||
299 | PUD_TYPE_SECT) | ||
300 | #endif | ||
298 | 301 | ||
299 | static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) | 302 | static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) |
300 | { | 303 | { |
301 | *pmdp = pmd; | 304 | *pmdp = pmd; |
302 | dsb(); | 305 | dsb(ishst); |
303 | } | 306 | } |
304 | 307 | ||
305 | static inline void pmd_clear(pmd_t *pmdp) | 308 | static inline void pmd_clear(pmd_t *pmdp) |
@@ -329,7 +332,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) | |||
329 | static inline void set_pud(pud_t *pudp, pud_t pud) | 332 | static inline void set_pud(pud_t *pudp, pud_t pud) |
330 | { | 333 | { |
331 | *pudp = pud; | 334 | *pudp = pud; |
332 | dsb(); | 335 | dsb(ishst); |
333 | } | 336 | } |
334 | 337 | ||
335 | static inline void pud_clear(pud_t *pudp) | 338 | static inline void pud_clear(pud_t *pudp) |
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 45b20cd6cbca..34de2a8f7d93 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h | |||
@@ -79,6 +79,7 @@ struct thread_struct { | |||
79 | unsigned long tp_value; | 79 | unsigned long tp_value; |
80 | struct fpsimd_state fpsimd_state; | 80 | struct fpsimd_state fpsimd_state; |
81 | unsigned long fault_address; /* fault info */ | 81 | unsigned long fault_address; /* fault info */ |
82 | unsigned long fault_code; /* ESR_EL1 value */ | ||
82 | struct debug_info debug; /* debugging */ | 83 | struct debug_info debug; /* debugging */ |
83 | }; | 84 | }; |
84 | 85 | ||
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index c7ba261dd4b3..a429b5940be2 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h | |||
@@ -135,6 +135,11 @@ struct pt_regs { | |||
135 | #define user_stack_pointer(regs) \ | 135 | #define user_stack_pointer(regs) \ |
136 | (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp) | 136 | (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp) |
137 | 137 | ||
138 | static inline unsigned long regs_return_value(struct pt_regs *regs) | ||
139 | { | ||
140 | return regs->regs[0]; | ||
141 | } | ||
142 | |||
138 | /* | 143 | /* |
139 | * Are the current registers suitable for user mode? (used to maintain | 144 | * Are the current registers suitable for user mode? (used to maintain |
140 | * security in signal handlers) | 145 | * security in signal handlers) |
diff --git a/arch/arm64/include/asm/sigcontext.h b/arch/arm64/include/asm/sigcontext.h deleted file mode 100644 index dca1094acc74..000000000000 --- a/arch/arm64/include/asm/sigcontext.h +++ /dev/null | |||
@@ -1,31 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 ARM Ltd. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | #ifndef __ASM_SIGCONTEXT_H | ||
17 | #define __ASM_SIGCONTEXT_H | ||
18 | |||
19 | #include <uapi/asm/sigcontext.h> | ||
20 | |||
21 | /* | ||
22 | * Auxiliary context saved in the sigcontext.__reserved array. Not exported to | ||
23 | * user space as it will change with the addition of new context. User space | ||
24 | * should check the magic/size information. | ||
25 | */ | ||
26 | struct aux_context { | ||
27 | struct fpsimd_context fpsimd; | ||
28 | /* additional context to be added before "end" */ | ||
29 | struct _aarch64_ctx end; | ||
30 | }; | ||
31 | #endif | ||
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3ee8b303d9a9..64d2d4884a9d 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h | |||
@@ -22,6 +22,18 @@ extern char *strrchr(const char *, int c); | |||
22 | #define __HAVE_ARCH_STRCHR | 22 | #define __HAVE_ARCH_STRCHR |
23 | extern char *strchr(const char *, int c); | 23 | extern char *strchr(const char *, int c); |
24 | 24 | ||
25 | #define __HAVE_ARCH_STRCMP | ||
26 | extern int strcmp(const char *, const char *); | ||
27 | |||
28 | #define __HAVE_ARCH_STRNCMP | ||
29 | extern int strncmp(const char *, const char *, __kernel_size_t); | ||
30 | |||
31 | #define __HAVE_ARCH_STRLEN | ||
32 | extern __kernel_size_t strlen(const char *); | ||
33 | |||
34 | #define __HAVE_ARCH_STRNLEN | ||
35 | extern __kernel_size_t strnlen(const char *, __kernel_size_t); | ||
36 | |||
25 | #define __HAVE_ARCH_MEMCPY | 37 | #define __HAVE_ARCH_MEMCPY |
26 | extern void *memcpy(void *, const void *, __kernel_size_t); | 38 | extern void *memcpy(void *, const void *, __kernel_size_t); |
27 | 39 | ||
@@ -34,4 +46,7 @@ extern void *memchr(const void *, int, __kernel_size_t); | |||
34 | #define __HAVE_ARCH_MEMSET | 46 | #define __HAVE_ARCH_MEMSET |
35 | extern void *memset(void *, int, __kernel_size_t); | 47 | extern void *memset(void *, int, __kernel_size_t); |
36 | 48 | ||
49 | #define __HAVE_ARCH_MEMCMP | ||
50 | extern int memcmp(const void *, const void *, size_t); | ||
51 | |||
37 | #endif | 52 | #endif |
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index 70ba9d4ee978..383771eb0b87 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | #include <linux/err.h> | 19 | #include <linux/err.h> |
20 | 20 | ||
21 | extern const void *sys_call_table[]; | ||
21 | 22 | ||
22 | static inline int syscall_get_nr(struct task_struct *task, | 23 | static inline int syscall_get_nr(struct task_struct *task, |
23 | struct pt_regs *regs) | 24 | struct pt_regs *regs) |
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 7b8e3a2a00fb..e40b6d06d515 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h | |||
@@ -91,6 +91,9 @@ static inline struct thread_info *current_thread_info(void) | |||
91 | /* | 91 | /* |
92 | * thread information flags: | 92 | * thread information flags: |
93 | * TIF_SYSCALL_TRACE - syscall trace active | 93 | * TIF_SYSCALL_TRACE - syscall trace active |
94 | * TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace | ||
95 | * TIF_SYSCALL_AUDIT - syscall auditing | ||
96 | * TIF_SECCOMP - syscall secure computing | ||
94 | * TIF_SIGPENDING - signal pending | 97 | * TIF_SIGPENDING - signal pending |
95 | * TIF_NEED_RESCHED - rescheduling necessary | 98 | * TIF_NEED_RESCHED - rescheduling necessary |
96 | * TIF_NOTIFY_RESUME - callback before returning to user | 99 | * TIF_NOTIFY_RESUME - callback before returning to user |
@@ -99,7 +102,11 @@ static inline struct thread_info *current_thread_info(void) | |||
99 | #define TIF_SIGPENDING 0 | 102 | #define TIF_SIGPENDING 0 |
100 | #define TIF_NEED_RESCHED 1 | 103 | #define TIF_NEED_RESCHED 1 |
101 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ | 104 | #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ |
105 | #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ | ||
102 | #define TIF_SYSCALL_TRACE 8 | 106 | #define TIF_SYSCALL_TRACE 8 |
107 | #define TIF_SYSCALL_AUDIT 9 | ||
108 | #define TIF_SYSCALL_TRACEPOINT 10 | ||
109 | #define TIF_SECCOMP 11 | ||
103 | #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ | 110 | #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ |
104 | #define TIF_FREEZE 19 | 111 | #define TIF_FREEZE 19 |
105 | #define TIF_RESTORE_SIGMASK 20 | 112 | #define TIF_RESTORE_SIGMASK 20 |
@@ -110,10 +117,18 @@ static inline struct thread_info *current_thread_info(void) | |||
110 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) | 117 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) |
111 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) | 118 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) |
112 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | 119 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) |
120 | #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) | ||
121 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | ||
122 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | ||
123 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) | ||
124 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) | ||
113 | #define _TIF_32BIT (1 << TIF_32BIT) | 125 | #define _TIF_32BIT (1 << TIF_32BIT) |
114 | 126 | ||
115 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ | 127 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ |
116 | _TIF_NOTIFY_RESUME) | 128 | _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) |
129 | |||
130 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ | ||
131 | _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) | ||
117 | 132 | ||
118 | #endif /* __KERNEL__ */ | 133 | #endif /* __KERNEL__ */ |
119 | #endif /* __ASM_THREAD_INFO_H */ | 134 | #endif /* __ASM_THREAD_INFO_H */ |
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 8b482035cfc2..b9349c4513ea 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h | |||
@@ -72,9 +72,9 @@ extern struct cpu_tlb_fns cpu_tlb; | |||
72 | */ | 72 | */ |
73 | static inline void flush_tlb_all(void) | 73 | static inline void flush_tlb_all(void) |
74 | { | 74 | { |
75 | dsb(); | 75 | dsb(ishst); |
76 | asm("tlbi vmalle1is"); | 76 | asm("tlbi vmalle1is"); |
77 | dsb(); | 77 | dsb(ish); |
78 | isb(); | 78 | isb(); |
79 | } | 79 | } |
80 | 80 | ||
@@ -82,9 +82,9 @@ static inline void flush_tlb_mm(struct mm_struct *mm) | |||
82 | { | 82 | { |
83 | unsigned long asid = (unsigned long)ASID(mm) << 48; | 83 | unsigned long asid = (unsigned long)ASID(mm) << 48; |
84 | 84 | ||
85 | dsb(); | 85 | dsb(ishst); |
86 | asm("tlbi aside1is, %0" : : "r" (asid)); | 86 | asm("tlbi aside1is, %0" : : "r" (asid)); |
87 | dsb(); | 87 | dsb(ish); |
88 | } | 88 | } |
89 | 89 | ||
90 | static inline void flush_tlb_page(struct vm_area_struct *vma, | 90 | static inline void flush_tlb_page(struct vm_area_struct *vma, |
@@ -93,16 +93,36 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, | |||
93 | unsigned long addr = uaddr >> 12 | | 93 | unsigned long addr = uaddr >> 12 | |
94 | ((unsigned long)ASID(vma->vm_mm) << 48); | 94 | ((unsigned long)ASID(vma->vm_mm) << 48); |
95 | 95 | ||
96 | dsb(); | 96 | dsb(ishst); |
97 | asm("tlbi vae1is, %0" : : "r" (addr)); | 97 | asm("tlbi vae1is, %0" : : "r" (addr)); |
98 | dsb(); | 98 | dsb(ish); |
99 | } | 99 | } |
100 | 100 | ||
101 | /* | 101 | static inline void flush_tlb_range(struct vm_area_struct *vma, |
102 | * Convert calls to our calling convention. | 102 | unsigned long start, unsigned long end) |
103 | */ | 103 | { |
104 | #define flush_tlb_range(vma,start,end) __cpu_flush_user_tlb_range(start,end,vma) | 104 | unsigned long asid = (unsigned long)ASID(vma->vm_mm) << 48; |
105 | #define flush_tlb_kernel_range(s,e) __cpu_flush_kern_tlb_range(s,e) | 105 | unsigned long addr; |
106 | start = asid | (start >> 12); | ||
107 | end = asid | (end >> 12); | ||
108 | |||
109 | dsb(ishst); | ||
110 | for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) | ||
111 | asm("tlbi vae1is, %0" : : "r"(addr)); | ||
112 | dsb(ish); | ||
113 | } | ||
114 | |||
115 | static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) | ||
116 | { | ||
117 | unsigned long addr; | ||
118 | start >>= 12; | ||
119 | end >>= 12; | ||
120 | |||
121 | dsb(ishst); | ||
122 | for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) | ||
123 | asm("tlbi vaae1is, %0" : : "r"(addr)); | ||
124 | dsb(ish); | ||
125 | } | ||
106 | 126 | ||
107 | /* | 127 | /* |
108 | * On AArch64, the cache coherency is handled via the set_pte_at() function. | 128 | * On AArch64, the cache coherency is handled via the set_pte_at() function. |
@@ -114,7 +134,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, | |||
114 | * set_pte() does not have a DSB, so make sure that the page table | 134 | * set_pte() does not have a DSB, so make sure that the page table |
115 | * write is visible. | 135 | * write is visible. |
116 | */ | 136 | */ |
117 | dsb(); | 137 | dsb(ishst); |
118 | } | 138 | } |
119 | 139 | ||
120 | #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) | 140 | #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) |
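For reference, the operand passed to "tlbi vae1is" in the new flush_tlb_range() above packs the ASID into bits [63:48] and the virtual address, shifted right by 12, into the low bits. A small C sketch of that encoding (illustration only, not kernel code):

#include <stdint.h>

/* Build the 64-bit operand for a VA+ASID TLB invalidate, as in flush_tlb_range(). */
static uint64_t tlbi_va_operand(uint64_t asid, uint64_t vaddr)
{
        return (asid << 48) | (vaddr >> 12);
}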
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0172e6d76bf3..7ebcd31ce51c 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h | |||
@@ -20,9 +20,6 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; | |||
20 | #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) | 20 | #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) |
21 | #define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) | 21 | #define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) |
22 | 22 | ||
23 | #define mc_capable() (cpu_topology[0].cluster_id != -1) | ||
24 | #define smt_capable() (cpu_topology[0].thread_id != -1) | ||
25 | |||
26 | void init_cpu_topology(void); | 23 | void init_cpu_topology(void); |
27 | void store_cpu_topology(unsigned int cpuid); | 24 | void store_cpu_topology(unsigned int cpuid); |
28 | const struct cpumask *cpu_coregroup_mask(int cpu); | 25 | const struct cpumask *cpu_coregroup_mask(int cpu); |
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index a4654c656a1e..e5f47df00c24 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h | |||
@@ -29,3 +29,5 @@ | |||
29 | #endif | 29 | #endif |
30 | #define __ARCH_WANT_SYS_CLONE | 30 | #define __ARCH_WANT_SYS_CLONE |
31 | #include <uapi/asm/unistd.h> | 31 | #include <uapi/asm/unistd.h> |
32 | |||
33 | #define NR_syscalls (__NR_syscalls) | ||
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 690ad51cc901..b72cf405b3fe 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h | |||
@@ -53,5 +53,12 @@ struct fpsimd_context { | |||
53 | __uint128_t vregs[32]; | 53 | __uint128_t vregs[32]; |
54 | }; | 54 | }; |
55 | 55 | ||
56 | /* ESR_EL1 context */ | ||
57 | #define ESR_MAGIC 0x45535201 | ||
58 | |||
59 | struct esr_context { | ||
60 | struct _aarch64_ctx head; | ||
61 | __u64 esr; | ||
62 | }; | ||
56 | 63 | ||
57 | #endif /* _UAPI__ASM_SIGCONTEXT_H */ | 64 | #endif /* _UAPI__ASM_SIGCONTEXT_H */ |
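User code that wants the new esr_context has to walk the variable-length records in the signal frame's reserved area; each record starts with a _aarch64_ctx header giving its magic and size. A hedged userspace sketch of that walk (the header layout and the zero-magic terminator are assumptions based on the existing _aarch64_ctx convention, not something added by this patch):

#include <stdint.h>
#include <stddef.h>

struct aarch64_ctx_hdr {        /* mirrors struct _aarch64_ctx */
        uint32_t magic;
        uint32_t size;
};

/* Scan the reserved area for a record with the given magic, e.g. ESR_MAGIC. */
static const void *find_record(const uint8_t *reserved, size_t len, uint32_t magic)
{
        size_t off = 0;

        while (off + sizeof(struct aarch64_ctx_hdr) <= len) {
                const struct aarch64_ctx_hdr *h =
                        (const struct aarch64_ctx_hdr *)(reserved + off);

                if (h->magic == 0 || h->size == 0)
                        break;                  /* end of records */
                if (h->magic == magic)
                        return h;
                off += h->size;
        }
        return NULL;
}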
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index ba5e17a522d5..cdaedad3afe5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile | |||
@@ -7,14 +7,19 @@ AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) | |||
7 | CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \ | 7 | CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \ |
8 | -I$(src)/../../../scripts/dtc/libfdt | 8 | -I$(src)/../../../scripts/dtc/libfdt |
9 | 9 | ||
10 | CFLAGS_REMOVE_ftrace.o = -pg | ||
11 | CFLAGS_REMOVE_insn.o = -pg | ||
12 | CFLAGS_REMOVE_return_address.o = -pg | ||
13 | |||
10 | # Object file lists. | 14 | # Object file lists. |
11 | arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \ | 15 | arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \ |
12 | entry-fpsimd.o process.o ptrace.o setup.o signal.o \ | 16 | entry-fpsimd.o process.o ptrace.o setup.o signal.o \ |
13 | sys.o stacktrace.o time.o traps.o io.o vdso.o \ | 17 | sys.o stacktrace.o time.o traps.o io.o vdso.o \ |
14 | hyp-stub.o psci.o cpu_ops.o insn.o | 18 | hyp-stub.o psci.o cpu_ops.o insn.o return_address.o |
15 | 19 | ||
16 | arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ | 20 | arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ |
17 | sys_compat.o | 21 | sys_compat.o |
22 | arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o | ||
18 | arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o | 23 | arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o |
19 | arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o | 24 | arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o |
20 | arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o | 25 | arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o |
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 338b568cd8ae..a85843ddbde8 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c | |||
@@ -44,10 +44,15 @@ EXPORT_SYMBOL(memstart_addr); | |||
44 | /* string / mem functions */ | 44 | /* string / mem functions */ |
45 | EXPORT_SYMBOL(strchr); | 45 | EXPORT_SYMBOL(strchr); |
46 | EXPORT_SYMBOL(strrchr); | 46 | EXPORT_SYMBOL(strrchr); |
47 | EXPORT_SYMBOL(strcmp); | ||
48 | EXPORT_SYMBOL(strncmp); | ||
49 | EXPORT_SYMBOL(strlen); | ||
50 | EXPORT_SYMBOL(strnlen); | ||
47 | EXPORT_SYMBOL(memset); | 51 | EXPORT_SYMBOL(memset); |
48 | EXPORT_SYMBOL(memcpy); | 52 | EXPORT_SYMBOL(memcpy); |
49 | EXPORT_SYMBOL(memmove); | 53 | EXPORT_SYMBOL(memmove); |
50 | EXPORT_SYMBOL(memchr); | 54 | EXPORT_SYMBOL(memchr); |
55 | EXPORT_SYMBOL(memcmp); | ||
51 | 56 | ||
52 | /* atomic bitops */ | 57 | /* atomic bitops */ |
53 | EXPORT_SYMBOL(set_bit); | 58 | EXPORT_SYMBOL(set_bit); |
@@ -56,3 +61,7 @@ EXPORT_SYMBOL(clear_bit); | |||
56 | EXPORT_SYMBOL(test_and_clear_bit); | 61 | EXPORT_SYMBOL(test_and_clear_bit); |
57 | EXPORT_SYMBOL(change_bit); | 62 | EXPORT_SYMBOL(change_bit); |
58 | EXPORT_SYMBOL(test_and_change_bit); | 63 | EXPORT_SYMBOL(test_and_change_bit); |
64 | |||
65 | #ifdef CONFIG_FUNCTION_TRACER | ||
66 | EXPORT_SYMBOL(_mcount); | ||
67 | #endif | ||
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa6..d358ccacfc00 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S | |||
@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state) | |||
41 | fpsimd_restore x0, 8 | 41 | fpsimd_restore x0, 8 |
42 | ret | 42 | ret |
43 | ENDPROC(fpsimd_load_state) | 43 | ENDPROC(fpsimd_load_state) |
44 | |||
45 | #ifdef CONFIG_KERNEL_MODE_NEON | ||
46 | |||
47 | /* | ||
48 | * Save the bottom n FP registers. | ||
49 | * | ||
50 | * x0 - pointer to struct fpsimd_partial_state | ||
51 | */ | ||
52 | ENTRY(fpsimd_save_partial_state) | ||
53 | fpsimd_save_partial x0, 1, 8, 9 | ||
54 | ret | ||
55 | ENDPROC(fpsimd_save_partial_state) | ||
56 | |||
57 | /* | ||
58 | * Load the bottom n FP registers. | ||
59 | * | ||
60 | * x0 - pointer to struct fpsimd_partial_state | ||
61 | */ | ||
62 | ENTRY(fpsimd_load_partial_state) | ||
63 | fpsimd_restore_partial x0, 8, 9 | ||
64 | ret | ||
65 | ENDPROC(fpsimd_load_partial_state) | ||
66 | |||
67 | #endif | ||
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S new file mode 100644 index 000000000000..b051871f2965 --- /dev/null +++ b/arch/arm64/kernel/entry-ftrace.S | |||
@@ -0,0 +1,218 @@ | |||
1 | /* | ||
2 | * arch/arm64/kernel/entry-ftrace.S | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/linkage.h> | ||
13 | #include <asm/ftrace.h> | ||
14 | #include <asm/insn.h> | ||
15 | |||
16 | /* | ||
17 | * Gcc with -pg will put the following code at the beginning of each function: | ||
18 | * mov x0, x30 | ||
19 | * bl _mcount | ||
20 | * [function's body ...] | ||
21 | * "bl _mcount" may be replaced to "bl ftrace_caller" or NOP if dynamic | ||
22 | * ftrace is enabled. | ||
23 | * | ||
24 | * Please note that the x0 argument is not used here, because the lr (x30) of | ||
25 | * the instrumented function can be recovered at any time by unwinding the call | ||
26 | * stack, as long as the kernel is compiled without -fomit-frame-pointer | ||
27 | * (i.e. with CONFIG_FRAME_POINTER, which is forced on for arm64). | ||
28 | * | ||
29 | * stack layout after mcount_enter in _mcount(): | ||
30 | * | ||
31 | * current sp/fp => 0:+-----+ | ||
32 | * in _mcount() | x29 | -> instrumented function's fp | ||
33 | * +-----+ | ||
34 | * | x30 | -> _mcount()'s lr (= instrumented function's pc) | ||
35 | * old sp => +16:+-----+ | ||
36 | * when instrumented | | | ||
37 | * function calls | ... | | ||
38 | * _mcount() | | | ||
39 | * | | | ||
40 | * instrumented => +xx:+-----+ | ||
41 | * function's fp | x29 | -> parent's fp | ||
42 | * +-----+ | ||
43 | * | x30 | -> instrumented function's lr (= parent's pc) | ||
44 | * +-----+ | ||
45 | * | ... | | ||
46 | */ | ||
47 | |||
48 | .macro mcount_enter | ||
49 | stp x29, x30, [sp, #-16]! | ||
50 | mov x29, sp | ||
51 | .endm | ||
52 | |||
53 | .macro mcount_exit | ||
54 | ldp x29, x30, [sp], #16 | ||
55 | ret | ||
56 | .endm | ||
57 | |||
58 | .macro mcount_adjust_addr rd, rn | ||
59 | sub \rd, \rn, #AARCH64_INSN_SIZE | ||
60 | .endm | ||
61 | |||
62 | /* for instrumented function's parent */ | ||
63 | .macro mcount_get_parent_fp reg | ||
64 | ldr \reg, [x29] | ||
65 | ldr \reg, [\reg] | ||
66 | .endm | ||
67 | |||
68 | /* for instrumented function */ | ||
69 | .macro mcount_get_pc0 reg | ||
70 | mcount_adjust_addr \reg, x30 | ||
71 | .endm | ||
72 | |||
73 | .macro mcount_get_pc reg | ||
74 | ldr \reg, [x29, #8] | ||
75 | mcount_adjust_addr \reg, \reg | ||
76 | .endm | ||
77 | |||
78 | .macro mcount_get_lr reg | ||
79 | ldr \reg, [x29] | ||
80 | ldr \reg, [\reg, #8] | ||
81 | mcount_adjust_addr \reg, \reg | ||
82 | .endm | ||
83 | |||
84 | .macro mcount_get_lr_addr reg | ||
85 | ldr \reg, [x29] | ||
86 | add \reg, \reg, #8 | ||
87 | .endm | ||
88 | |||
89 | #ifndef CONFIG_DYNAMIC_FTRACE | ||
90 | /* | ||
91 | * void _mcount(unsigned long return_address) | ||
92 | * @return_address: return address to instrumented function | ||
93 | * | ||
94 | * This function makes calls, if enabled, to: | ||
95 | * - tracer function to probe instrumented function's entry, | ||
96 | * - ftrace_graph_caller to set up an exit hook | ||
97 | */ | ||
98 | ENTRY(_mcount) | ||
99 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
100 | ldr x0, =ftrace_trace_stop | ||
101 | ldr x0, [x0] // if (ftrace_trace_stop) | ||
102 | cbnz x0, ftrace_stub // return; | ||
103 | #endif | ||
104 | mcount_enter | ||
105 | |||
106 | ldr x0, =ftrace_trace_function | ||
107 | ldr x2, [x0] | ||
108 | adr x0, ftrace_stub | ||
109 | cmp x0, x2 // if (ftrace_trace_function | ||
110 | b.eq skip_ftrace_call // != ftrace_stub) { | ||
111 | |||
112 | mcount_get_pc x0 // function's pc | ||
113 | mcount_get_lr x1 // function's lr (= parent's pc) | ||
114 | blr x2 // (*ftrace_trace_function)(pc, lr); | ||
115 | |||
116 | #ifndef CONFIG_FUNCTION_GRAPH_TRACER | ||
117 | skip_ftrace_call: // return; | ||
118 | mcount_exit // } | ||
119 | #else | ||
120 | mcount_exit // return; | ||
121 | // } | ||
122 | skip_ftrace_call: | ||
123 | ldr x1, =ftrace_graph_return | ||
124 | ldr x2, [x1] // if ((ftrace_graph_return | ||
125 | cmp x0, x2 // != ftrace_stub) | ||
126 | b.ne ftrace_graph_caller | ||
127 | |||
128 | ldr x1, =ftrace_graph_entry // || (ftrace_graph_entry | ||
129 | ldr x2, [x1] // != ftrace_graph_entry_stub)) | ||
130 | ldr x0, =ftrace_graph_entry_stub | ||
131 | cmp x0, x2 | ||
132 | b.ne ftrace_graph_caller // ftrace_graph_caller(); | ||
133 | |||
134 | mcount_exit | ||
135 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
136 | ENDPROC(_mcount) | ||
137 | |||
138 | #else /* CONFIG_DYNAMIC_FTRACE */ | ||
139 | /* | ||
140 | * _mcount() is used to build the kernel with the -pg option, but all the | ||
141 | * branch instructions to _mcount() are initially replaced with NOPs at kernel | ||
142 | * start up, and each NOP is later patched to branch to ftrace_caller() when | ||
143 | * enabled, or back to a NOP when disabled, on a per-function basis. | ||
144 | */ | ||
145 | ENTRY(_mcount) | ||
146 | ret | ||
147 | ENDPROC(_mcount) | ||
148 | |||
149 | /* | ||
150 | * void ftrace_caller(unsigned long return_address) | ||
151 | * @return_address: return address to instrumented function | ||
152 | * | ||
153 | * This function is a counterpart of _mcount() in 'static' ftrace, and | ||
154 | * makes calls to: | ||
155 | * - tracer function to probe instrumented function's entry, | ||
156 | * - ftrace_graph_caller to set up an exit hook | ||
157 | */ | ||
158 | ENTRY(ftrace_caller) | ||
159 | mcount_enter | ||
160 | |||
161 | mcount_get_pc0 x0 // function's pc | ||
162 | mcount_get_lr x1 // function's lr | ||
163 | |||
164 | .global ftrace_call | ||
165 | ftrace_call: // tracer(pc, lr); | ||
166 | nop // This will be replaced with "bl xxx" | ||
167 | // where xxx can be any kind of tracer. | ||
168 | |||
169 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
170 | .global ftrace_graph_call | ||
171 | ftrace_graph_call: // ftrace_graph_caller(); | ||
172 | nop // If enabled, this will be replaced | ||
173 | // "b ftrace_graph_caller" | ||
174 | #endif | ||
175 | |||
176 | mcount_exit | ||
177 | ENDPROC(ftrace_caller) | ||
178 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
179 | |||
180 | ENTRY(ftrace_stub) | ||
181 | ret | ||
182 | ENDPROC(ftrace_stub) | ||
183 | |||
184 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
185 | /* | ||
186 | * void ftrace_graph_caller(void) | ||
187 | * | ||
188 | * Called from _mcount() or ftrace_caller() when function_graph tracer is | ||
189 | * selected. | ||
190 | * This function, together with prepare_ftrace_return(), fakes the link | ||
191 | * register's value on the call stack in order to intercept the instrumented | ||
192 | * function's return path and run return_to_handler() later on its exit. | ||
193 | */ | ||
194 | ENTRY(ftrace_graph_caller) | ||
195 | mcount_get_lr_addr x0 // pointer to function's saved lr | ||
196 | mcount_get_pc x1 // function's pc | ||
197 | mcount_get_parent_fp x2 // parent's fp | ||
198 | bl prepare_ftrace_return // prepare_ftrace_return(&lr, pc, fp) | ||
199 | |||
200 | mcount_exit | ||
201 | ENDPROC(ftrace_graph_caller) | ||
202 | |||
203 | /* | ||
204 | * void return_to_handler(void) | ||
205 | * | ||
206 | * Run ftrace_return_to_handler() before going back to parent. | ||
207 | * @fp is checked against the value passed by ftrace_graph_caller() | ||
208 | * only when CONFIG_FUNCTION_GRAPH_FP_TEST is enabled. | ||
209 | */ | ||
210 | ENTRY(return_to_handler) | ||
211 | str x0, [sp, #-16]! | ||
212 | mov x0, x29 // parent's fp | ||
213 | bl ftrace_return_to_handler // addr = ftrace_return_to_handler(fp); | ||
214 | mov x30, x0 // restore the original return address | ||
215 | ldr x0, [sp], #16 | ||
216 | ret | ||
217 | END(return_to_handler) | ||
218 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
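As a cross-check of the mcount_get_pc / mcount_get_lr macros above, here is the same frame arithmetic written out in C (illustration only; the real code stays in assembly). It assumes the frame-record layout shown in the stack diagram: x29 points at a {saved fp, saved lr} pair, and INSN_SIZE stands in for AARCH64_INSN_SIZE.

#include <stdint.h>

#define INSN_SIZE 4     /* stands in for AARCH64_INSN_SIZE */

struct frame_record {
        uint64_t fp;    /* saved x29: previous frame record */
        uint64_t lr;    /* saved x30: return address */
};

/* _mcount()'s saved lr is the instrumented function's pc (one insn past the call). */
static uint64_t instrumented_pc(const struct frame_record *mcount_frame)
{
        return mcount_frame->lr - INSN_SIZE;
}

/* Follow saved x29 to the instrumented function's frame to get its caller's pc. */
static uint64_t instrumented_lr(const struct frame_record *mcount_frame)
{
        const struct frame_record *callee =
                (const struct frame_record *)mcount_frame->fp;

        return callee->lr - INSN_SIZE;
}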
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 39ac630d83de..bf017f4ffb4f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S | |||
@@ -60,6 +60,9 @@ | |||
60 | push x0, x1 | 60 | push x0, x1 |
61 | .if \el == 0 | 61 | .if \el == 0 |
62 | mrs x21, sp_el0 | 62 | mrs x21, sp_el0 |
63 | get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, | ||
64 | ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug | ||
65 | disable_step_tsk x19, x20 // exceptions when scheduling. | ||
63 | .else | 66 | .else |
64 | add x21, sp, #S_FRAME_SIZE | 67 | add x21, sp, #S_FRAME_SIZE |
65 | .endif | 68 | .endif |
@@ -259,7 +262,7 @@ el1_da: | |||
259 | * Data abort handling | 262 | * Data abort handling |
260 | */ | 263 | */ |
261 | mrs x0, far_el1 | 264 | mrs x0, far_el1 |
262 | enable_dbg_if_not_stepping x2 | 265 | enable_dbg |
263 | // re-enable interrupts if they were enabled in the aborted context | 266 | // re-enable interrupts if they were enabled in the aborted context |
264 | tbnz x23, #7, 1f // PSR_I_BIT | 267 | tbnz x23, #7, 1f // PSR_I_BIT |
265 | enable_irq | 268 | enable_irq |
@@ -275,6 +278,7 @@ el1_sp_pc: | |||
275 | * Stack or PC alignment exception handling | 278 | * Stack or PC alignment exception handling |
276 | */ | 279 | */ |
277 | mrs x0, far_el1 | 280 | mrs x0, far_el1 |
281 | enable_dbg | ||
278 | mov x1, x25 | 282 | mov x1, x25 |
279 | mov x2, sp | 283 | mov x2, sp |
280 | b do_sp_pc_abort | 284 | b do_sp_pc_abort |
@@ -282,6 +286,7 @@ el1_undef: | |||
282 | /* | 286 | /* |
283 | * Undefined instruction | 287 | * Undefined instruction |
284 | */ | 288 | */ |
289 | enable_dbg | ||
285 | mov x0, sp | 290 | mov x0, sp |
286 | b do_undefinstr | 291 | b do_undefinstr |
287 | el1_dbg: | 292 | el1_dbg: |
@@ -294,10 +299,11 @@ el1_dbg: | |||
294 | mrs x0, far_el1 | 299 | mrs x0, far_el1 |
295 | mov x2, sp // struct pt_regs | 300 | mov x2, sp // struct pt_regs |
296 | bl do_debug_exception | 301 | bl do_debug_exception |
297 | 302 | enable_dbg | |
298 | kernel_exit 1 | 303 | kernel_exit 1 |
299 | el1_inv: | 304 | el1_inv: |
300 | // TODO: add support for undefined instructions in kernel mode | 305 | // TODO: add support for undefined instructions in kernel mode |
306 | enable_dbg | ||
301 | mov x0, sp | 307 | mov x0, sp |
302 | mov x1, #BAD_SYNC | 308 | mov x1, #BAD_SYNC |
303 | mrs x2, esr_el1 | 309 | mrs x2, esr_el1 |
@@ -307,7 +313,7 @@ ENDPROC(el1_sync) | |||
307 | .align 6 | 313 | .align 6 |
308 | el1_irq: | 314 | el1_irq: |
309 | kernel_entry 1 | 315 | kernel_entry 1 |
310 | enable_dbg_if_not_stepping x0 | 316 | enable_dbg |
311 | #ifdef CONFIG_TRACE_IRQFLAGS | 317 | #ifdef CONFIG_TRACE_IRQFLAGS |
312 | bl trace_hardirqs_off | 318 | bl trace_hardirqs_off |
313 | #endif | 319 | #endif |
@@ -332,8 +338,7 @@ ENDPROC(el1_irq) | |||
332 | #ifdef CONFIG_PREEMPT | 338 | #ifdef CONFIG_PREEMPT |
333 | el1_preempt: | 339 | el1_preempt: |
334 | mov x24, lr | 340 | mov x24, lr |
335 | 1: enable_dbg | 341 | 1: bl preempt_schedule_irq // irq en/disable is done inside |
336 | bl preempt_schedule_irq // irq en/disable is done inside | ||
337 | ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS | 342 | ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS |
338 | tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? | 343 | tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? |
339 | ret x24 | 344 | ret x24 |
@@ -349,7 +354,7 @@ el0_sync: | |||
349 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class | 354 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class |
350 | cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state | 355 | cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state |
351 | b.eq el0_svc | 356 | b.eq el0_svc |
352 | adr lr, ret_from_exception | 357 | adr lr, ret_to_user |
353 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 | 358 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 |
354 | b.eq el0_da | 359 | b.eq el0_da |
355 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 | 360 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 |
@@ -378,7 +383,7 @@ el0_sync_compat: | |||
378 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class | 383 | lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class |
379 | cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state | 384 | cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state |
380 | b.eq el0_svc_compat | 385 | b.eq el0_svc_compat |
381 | adr lr, ret_from_exception | 386 | adr lr, ret_to_user |
382 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 | 387 | cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 |
383 | b.eq el0_da | 388 | b.eq el0_da |
384 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 | 389 | cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 |
@@ -423,11 +428,8 @@ el0_da: | |||
423 | */ | 428 | */ |
424 | mrs x0, far_el1 | 429 | mrs x0, far_el1 |
425 | bic x0, x0, #(0xff << 56) | 430 | bic x0, x0, #(0xff << 56) |
426 | disable_step x1 | ||
427 | isb | ||
428 | enable_dbg | ||
429 | // enable interrupts before calling the main handler | 431 | // enable interrupts before calling the main handler |
430 | enable_irq | 432 | enable_dbg_and_irq |
431 | mov x1, x25 | 433 | mov x1, x25 |
432 | mov x2, sp | 434 | mov x2, sp |
433 | b do_mem_abort | 435 | b do_mem_abort |
@@ -436,11 +438,8 @@ el0_ia: | |||
436 | * Instruction abort handling | 438 | * Instruction abort handling |
437 | */ | 439 | */ |
438 | mrs x0, far_el1 | 440 | mrs x0, far_el1 |
439 | disable_step x1 | ||
440 | isb | ||
441 | enable_dbg | ||
442 | // enable interrupts before calling the main handler | 441 | // enable interrupts before calling the main handler |
443 | enable_irq | 442 | enable_dbg_and_irq |
444 | orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts | 443 | orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts |
445 | mov x2, sp | 444 | mov x2, sp |
446 | b do_mem_abort | 445 | b do_mem_abort |
@@ -448,6 +447,7 @@ el0_fpsimd_acc: | |||
448 | /* | 447 | /* |
449 | * Floating Point or Advanced SIMD access | 448 | * Floating Point or Advanced SIMD access |
450 | */ | 449 | */ |
450 | enable_dbg | ||
451 | mov x0, x25 | 451 | mov x0, x25 |
452 | mov x1, sp | 452 | mov x1, sp |
453 | b do_fpsimd_acc | 453 | b do_fpsimd_acc |
@@ -455,6 +455,7 @@ el0_fpsimd_exc: | |||
455 | /* | 455 | /* |
456 | * Floating Point or Advanced SIMD exception | 456 | * Floating Point or Advanced SIMD exception |
457 | */ | 457 | */ |
458 | enable_dbg | ||
458 | mov x0, x25 | 459 | mov x0, x25 |
459 | mov x1, sp | 460 | mov x1, sp |
460 | b do_fpsimd_exc | 461 | b do_fpsimd_exc |
@@ -463,11 +464,8 @@ el0_sp_pc: | |||
463 | * Stack or PC alignment exception handling | 464 | * Stack or PC alignment exception handling |
464 | */ | 465 | */ |
465 | mrs x0, far_el1 | 466 | mrs x0, far_el1 |
466 | disable_step x1 | ||
467 | isb | ||
468 | enable_dbg | ||
469 | // enable interrupts before calling the main handler | 467 | // enable interrupts before calling the main handler |
470 | enable_irq | 468 | enable_dbg_and_irq |
471 | mov x1, x25 | 469 | mov x1, x25 |
472 | mov x2, sp | 470 | mov x2, sp |
473 | b do_sp_pc_abort | 471 | b do_sp_pc_abort |
@@ -475,9 +473,9 @@ el0_undef: | |||
475 | /* | 473 | /* |
476 | * Undefined instruction | 474 | * Undefined instruction |
477 | */ | 475 | */ |
478 | mov x0, sp | ||
479 | // enable interrupts before calling the main handler | 476 | // enable interrupts before calling the main handler |
480 | enable_irq | 477 | enable_dbg_and_irq |
478 | mov x0, sp | ||
481 | b do_undefinstr | 479 | b do_undefinstr |
482 | el0_dbg: | 480 | el0_dbg: |
483 | /* | 481 | /* |
@@ -485,11 +483,13 @@ el0_dbg: | |||
485 | */ | 483 | */ |
486 | tbnz x24, #0, el0_inv // EL0 only | 484 | tbnz x24, #0, el0_inv // EL0 only |
487 | mrs x0, far_el1 | 485 | mrs x0, far_el1 |
488 | disable_step x1 | ||
489 | mov x1, x25 | 486 | mov x1, x25 |
490 | mov x2, sp | 487 | mov x2, sp |
491 | b do_debug_exception | 488 | bl do_debug_exception |
489 | enable_dbg | ||
490 | b ret_to_user | ||
492 | el0_inv: | 491 | el0_inv: |
492 | enable_dbg | ||
493 | mov x0, sp | 493 | mov x0, sp |
494 | mov x1, #BAD_SYNC | 494 | mov x1, #BAD_SYNC |
495 | mrs x2, esr_el1 | 495 | mrs x2, esr_el1 |
@@ -500,15 +500,12 @@ ENDPROC(el0_sync) | |||
500 | el0_irq: | 500 | el0_irq: |
501 | kernel_entry 0 | 501 | kernel_entry 0 |
502 | el0_irq_naked: | 502 | el0_irq_naked: |
503 | disable_step x1 | ||
504 | isb | ||
505 | enable_dbg | 503 | enable_dbg |
506 | #ifdef CONFIG_TRACE_IRQFLAGS | 504 | #ifdef CONFIG_TRACE_IRQFLAGS |
507 | bl trace_hardirqs_off | 505 | bl trace_hardirqs_off |
508 | #endif | 506 | #endif |
509 | 507 | ||
510 | irq_handler | 508 | irq_handler |
511 | get_thread_info tsk | ||
512 | 509 | ||
513 | #ifdef CONFIG_TRACE_IRQFLAGS | 510 | #ifdef CONFIG_TRACE_IRQFLAGS |
514 | bl trace_hardirqs_on | 511 | bl trace_hardirqs_on |
@@ -517,14 +514,6 @@ el0_irq_naked: | |||
517 | ENDPROC(el0_irq) | 514 | ENDPROC(el0_irq) |
518 | 515 | ||
519 | /* | 516 | /* |
520 | * This is the return code to user mode for abort handlers | ||
521 | */ | ||
522 | ret_from_exception: | ||
523 | get_thread_info tsk | ||
524 | b ret_to_user | ||
525 | ENDPROC(ret_from_exception) | ||
526 | |||
527 | /* | ||
528 | * Register switch for AArch64. The callee-saved registers need to be saved | 517 | * Register switch for AArch64. The callee-saved registers need to be saved |
529 | * and restored. On entry: | 518 | * and restored. On entry: |
530 | * x0 = previous task_struct (must be preserved across the switch) | 519 | * x0 = previous task_struct (must be preserved across the switch) |
@@ -563,10 +552,7 @@ ret_fast_syscall: | |||
563 | ldr x1, [tsk, #TI_FLAGS] | 552 | ldr x1, [tsk, #TI_FLAGS] |
564 | and x2, x1, #_TIF_WORK_MASK | 553 | and x2, x1, #_TIF_WORK_MASK |
565 | cbnz x2, fast_work_pending | 554 | cbnz x2, fast_work_pending |
566 | tbz x1, #TIF_SINGLESTEP, fast_exit | 555 | enable_step_tsk x1, x2 |
567 | disable_dbg | ||
568 | enable_step x2 | ||
569 | fast_exit: | ||
570 | kernel_exit 0, ret = 1 | 556 | kernel_exit 0, ret = 1 |
571 | 557 | ||
572 | /* | 558 | /* |
@@ -576,7 +562,7 @@ fast_work_pending: | |||
576 | str x0, [sp, #S_X0] // returned x0 | 562 | str x0, [sp, #S_X0] // returned x0 |
577 | work_pending: | 563 | work_pending: |
578 | tbnz x1, #TIF_NEED_RESCHED, work_resched | 564 | tbnz x1, #TIF_NEED_RESCHED, work_resched |
579 | /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ | 565 | /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ |
580 | ldr x2, [sp, #S_PSTATE] | 566 | ldr x2, [sp, #S_PSTATE] |
581 | mov x0, sp // 'regs' | 567 | mov x0, sp // 'regs' |
582 | tst x2, #PSR_MODE_MASK // user mode regs? | 568 | tst x2, #PSR_MODE_MASK // user mode regs? |
@@ -585,7 +571,6 @@ work_pending: | |||
585 | bl do_notify_resume | 571 | bl do_notify_resume |
586 | b ret_to_user | 572 | b ret_to_user |
587 | work_resched: | 573 | work_resched: |
588 | enable_dbg | ||
589 | bl schedule | 574 | bl schedule |
590 | 575 | ||
591 | /* | 576 | /* |
@@ -596,9 +581,7 @@ ret_to_user: | |||
596 | ldr x1, [tsk, #TI_FLAGS] | 581 | ldr x1, [tsk, #TI_FLAGS] |
597 | and x2, x1, #_TIF_WORK_MASK | 582 | and x2, x1, #_TIF_WORK_MASK |
598 | cbnz x2, work_pending | 583 | cbnz x2, work_pending |
599 | tbz x1, #TIF_SINGLESTEP, no_work_pending | 584 | enable_step_tsk x1, x2 |
600 | disable_dbg | ||
601 | enable_step x2 | ||
602 | no_work_pending: | 585 | no_work_pending: |
603 | kernel_exit 0, ret = 0 | 586 | kernel_exit 0, ret = 0 |
604 | ENDPROC(ret_to_user) | 587 | ENDPROC(ret_to_user) |
@@ -625,14 +608,11 @@ el0_svc: | |||
625 | mov sc_nr, #__NR_syscalls | 608 | mov sc_nr, #__NR_syscalls |
626 | el0_svc_naked: // compat entry point | 609 | el0_svc_naked: // compat entry point |
627 | stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number | 610 | stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number |
628 | disable_step x16 | 611 | enable_dbg_and_irq |
629 | isb | ||
630 | enable_dbg | ||
631 | enable_irq | ||
632 | 612 | ||
633 | get_thread_info tsk | 613 | ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks |
634 | ldr x16, [tsk, #TI_FLAGS] // check for syscall tracing | 614 | tst x16, #_TIF_SYSCALL_WORK |
635 | tbnz x16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls? | 615 | b.ne __sys_trace |
636 | adr lr, ret_fast_syscall // return address | 616 | adr lr, ret_fast_syscall // return address |
637 | cmp scno, sc_nr // check upper syscall limit | 617 | cmp scno, sc_nr // check upper syscall limit |
638 | b.hs ni_sys | 618 | b.hs ni_sys |
@@ -648,9 +628,8 @@ ENDPROC(el0_svc) | |||
648 | * switches, and waiting for our parent to respond. | 628 | * switches, and waiting for our parent to respond. |
649 | */ | 629 | */ |
650 | __sys_trace: | 630 | __sys_trace: |
651 | mov x1, sp | 631 | mov x0, sp |
652 | mov w0, #0 // trace entry | 632 | bl syscall_trace_enter |
653 | bl syscall_trace | ||
654 | adr lr, __sys_trace_return // return address | 633 | adr lr, __sys_trace_return // return address |
655 | uxtw scno, w0 // syscall number (possibly new) | 634 | uxtw scno, w0 // syscall number (possibly new) |
656 | mov x1, sp // pointer to regs | 635 | mov x1, sp // pointer to regs |
@@ -665,9 +644,8 @@ __sys_trace: | |||
665 | 644 | ||
666 | __sys_trace_return: | 645 | __sys_trace_return: |
667 | str x0, [sp] // save returned x0 | 646 | str x0, [sp] // save returned x0 |
668 | mov x1, sp | 647 | mov x0, sp |
669 | mov w0, #1 // trace exit | 648 | bl syscall_trace_exit |
670 | bl syscall_trace | ||
671 | b ret_to_user | 649 | b ret_to_user |
672 | 650 | ||
673 | /* | 651 | /* |
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4aef42a04bdc..ad8aebb1cdef 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c | |||
@@ -35,6 +35,60 @@ | |||
35 | #define FPEXC_IDF (1 << 7) | 35 | #define FPEXC_IDF (1 << 7) |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * In order to reduce the number of times the FPSIMD state is needlessly saved | ||
39 | * and restored, we need to keep track of two things: | ||
40 | * (a) for each task, we need to remember which CPU was the last one to have | ||
41 | * the task's FPSIMD state loaded into its FPSIMD registers; | ||
42 | * (b) for each CPU, we need to remember which task's userland FPSIMD state has | ||
43 | * been loaded into its FPSIMD registers most recently, or whether it has | ||
44 | * been used to perform kernel mode NEON in the meantime. | ||
45 | * | ||
46 | * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to | ||
47 | * the id of the current CPU every time the state is loaded onto a CPU. For (b), | ||
48 | * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the | ||
49 | * address of the userland FPSIMD state of the task most recently loaded onto | ||
50 | * the CPU, or NULL if kernel mode NEON has been performed since then. | ||
51 | * | ||
52 | * With this in place, we no longer have to restore the next FPSIMD state right | ||
53 | * when switching between tasks. Instead, we can defer this check to userland | ||
54 | * resume, at which time we verify whether the CPU's fpsimd_last_state and the | ||
55 | * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we | ||
56 | * can omit the FPSIMD restore. | ||
57 | * | ||
58 | * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to | ||
59 | * indicate whether or not the userland FPSIMD state of the current task is | ||
60 | * present in the registers. The flag is set unless the FPSIMD registers of this | ||
61 | * CPU currently contain the most recent userland FPSIMD state of the current | ||
62 | * task. | ||
63 | * | ||
64 | * For a certain task, the sequence may look something like this: | ||
65 | * - the task gets scheduled in; if both the task's fpsimd_state.cpu field | ||
66 | * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu | ||
67 | * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is | ||
68 | * cleared, otherwise it is set; | ||
69 | * | ||
70 | * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's | ||
71 | * userland FPSIMD state is copied from memory to the registers, the task's | ||
72 | * fpsimd_state.cpu field is set to the id of the current CPU, the current | ||
73 | * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the | ||
74 | * TIF_FOREIGN_FPSTATE flag is cleared; | ||
75 | * | ||
76 | * - the task executes an ordinary syscall; upon return to userland, the | ||
77 | * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is | ||
78 | * restored; | ||
79 | * | ||
80 | * - the task executes a syscall which executes some NEON instructions; this is | ||
81 | * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD | ||
82 | * register contents to memory, clears the fpsimd_last_state per-cpu variable | ||
83 | * and sets the TIF_FOREIGN_FPSTATE flag; | ||
84 | * | ||
85 | * - the task gets preempted after kernel_neon_end() is called; as we have not | ||
86 | * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so | ||
87 | * whatever is in the FPSIMD registers is not saved to memory, but discarded. | ||
88 | */ | ||
89 | static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); | ||
90 | |||
91 | /* | ||
38 | * Trapped FP/ASIMD access. | 92 | * Trapped FP/ASIMD access. |
39 | */ | 93 | */ |
40 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) | 94 | void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) |
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) | |||
72 | 126 | ||
73 | void fpsimd_thread_switch(struct task_struct *next) | 127 | void fpsimd_thread_switch(struct task_struct *next) |
74 | { | 128 | { |
75 | /* check if not kernel threads */ | 129 | /* |
76 | if (current->mm) | 130 | * Save the current FPSIMD state to memory, but only if whatever is in |
131 | * the registers is in fact the most recent userland FPSIMD state of | ||
132 | * 'current'. | ||
133 | */ | ||
134 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
77 | fpsimd_save_state(¤t->thread.fpsimd_state); | 135 | fpsimd_save_state(¤t->thread.fpsimd_state); |
78 | if (next->mm) | 136 | |
79 | fpsimd_load_state(&next->thread.fpsimd_state); | 137 | if (next->mm) { |
138 | /* | ||
139 | * If we are switching to a task whose most recent userland | ||
140 | * FPSIMD state is already in the registers of *this* cpu, | ||
141 | * we can skip loading the state from memory. Otherwise, set | ||
142 | * the TIF_FOREIGN_FPSTATE flag so the state will be loaded | ||
143 | * upon the next return to userland. | ||
144 | */ | ||
145 | struct fpsimd_state *st = &next->thread.fpsimd_state; | ||
146 | |||
147 | if (__this_cpu_read(fpsimd_last_state) == st | ||
148 | && st->cpu == smp_processor_id()) | ||
149 | clear_ti_thread_flag(task_thread_info(next), | ||
150 | TIF_FOREIGN_FPSTATE); | ||
151 | else | ||
152 | set_ti_thread_flag(task_thread_info(next), | ||
153 | TIF_FOREIGN_FPSTATE); | ||
154 | } | ||
80 | } | 155 | } |
81 | 156 | ||
82 | void fpsimd_flush_thread(void) | 157 | void fpsimd_flush_thread(void) |
83 | { | 158 | { |
84 | preempt_disable(); | ||
85 | memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); | 159 | memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); |
86 | fpsimd_load_state(¤t->thread.fpsimd_state); | 160 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
161 | } | ||
162 | |||
163 | /* | ||
164 | * Save the userland FPSIMD state of 'current' to memory, but only if the state | ||
165 | * currently held in the registers does in fact belong to 'current' | ||
166 | */ | ||
167 | void fpsimd_preserve_current_state(void) | ||
168 | { | ||
169 | preempt_disable(); | ||
170 | if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
171 | fpsimd_save_state(¤t->thread.fpsimd_state); | ||
172 | preempt_enable(); | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * Load the userland FPSIMD state of 'current' from memory, but only if the | ||
177 | * FPSIMD state already held in the registers is /not/ the most recent FPSIMD | ||
178 | * state of 'current' | ||
179 | */ | ||
180 | void fpsimd_restore_current_state(void) | ||
181 | { | ||
182 | preempt_disable(); | ||
183 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
184 | struct fpsimd_state *st = ¤t->thread.fpsimd_state; | ||
185 | |||
186 | fpsimd_load_state(st); | ||
187 | this_cpu_write(fpsimd_last_state, st); | ||
188 | st->cpu = smp_processor_id(); | ||
189 | } | ||
190 | preempt_enable(); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Load an updated userland FPSIMD state for 'current' from memory and set the | ||
195 | * flag that indicates that the FPSIMD register contents are the most recent | ||
196 | * FPSIMD state of 'current' | ||
197 | */ | ||
198 | void fpsimd_update_current_state(struct fpsimd_state *state) | ||
199 | { | ||
200 | preempt_disable(); | ||
201 | fpsimd_load_state(state); | ||
202 | if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { | ||
203 | struct fpsimd_state *st = ¤t->thread.fpsimd_state; | ||
204 | |||
205 | this_cpu_write(fpsimd_last_state, st); | ||
206 | st->cpu = smp_processor_id(); | ||
207 | } | ||
87 | preempt_enable(); | 208 | preempt_enable(); |
88 | } | 209 | } |
89 | 210 | ||
211 | /* | ||
212 | * Invalidate live CPU copies of task t's FPSIMD state | ||
213 | */ | ||
214 | void fpsimd_flush_task_state(struct task_struct *t) | ||
215 | { | ||
216 | t->thread.fpsimd_state.cpu = NR_CPUS; | ||
217 | } | ||
218 | |||
90 | #ifdef CONFIG_KERNEL_MODE_NEON | 219 | #ifdef CONFIG_KERNEL_MODE_NEON |
91 | 220 | ||
221 | static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); | ||
222 | static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); | ||
223 | |||
92 | /* | 224 | /* |
93 | * Kernel-side NEON support functions | 225 | * Kernel-side NEON support functions |
94 | */ | 226 | */ |
95 | void kernel_neon_begin(void) | 227 | void kernel_neon_begin_partial(u32 num_regs) |
96 | { | 228 | { |
97 | /* Avoid using the NEON in interrupt context */ | 229 | if (in_interrupt()) { |
98 | BUG_ON(in_interrupt()); | 230 | struct fpsimd_partial_state *s = this_cpu_ptr( |
99 | preempt_disable(); | 231 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); |
100 | 232 | ||
101 | if (current->mm) | 233 | BUG_ON(num_regs > 32); |
102 | fpsimd_save_state(¤t->thread.fpsimd_state); | 234 | fpsimd_save_partial_state(s, roundup(num_regs, 2)); |
235 | } else { | ||
236 | /* | ||
237 | * Save the userland FPSIMD state if we have one and if we | ||
238 | * haven't done so already. Clear fpsimd_last_state to indicate | ||
239 | * that there is no longer userland FPSIMD state in the | ||
240 | * registers. | ||
241 | */ | ||
242 | preempt_disable(); | ||
243 | if (current->mm && | ||
244 | !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) | ||
245 | fpsimd_save_state(¤t->thread.fpsimd_state); | ||
246 | this_cpu_write(fpsimd_last_state, NULL); | ||
247 | } | ||
103 | } | 248 | } |
104 | EXPORT_SYMBOL(kernel_neon_begin); | 249 | EXPORT_SYMBOL(kernel_neon_begin_partial); |
105 | 250 | ||
106 | void kernel_neon_end(void) | 251 | void kernel_neon_end(void) |
107 | { | 252 | { |
108 | if (current->mm) | 253 | if (in_interrupt()) { |
109 | fpsimd_load_state(¤t->thread.fpsimd_state); | 254 | struct fpsimd_partial_state *s = this_cpu_ptr( |
110 | 255 | in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); | |
111 | preempt_enable(); | 256 | fpsimd_load_partial_state(s); |
257 | } else { | ||
258 | preempt_enable(); | ||
259 | } | ||
112 | } | 260 | } |
113 | EXPORT_SYMBOL(kernel_neon_end); | 261 | EXPORT_SYMBOL(kernel_neon_end); |
114 | 262 | ||
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, | |||
120 | { | 268 | { |
121 | switch (cmd) { | 269 | switch (cmd) { |
122 | case CPU_PM_ENTER: | 270 | case CPU_PM_ENTER: |
123 | if (current->mm) | 271 | if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) |
124 | fpsimd_save_state(¤t->thread.fpsimd_state); | 272 | fpsimd_save_state(¤t->thread.fpsimd_state); |
125 | break; | 273 | break; |
126 | case CPU_PM_EXIT: | 274 | case CPU_PM_EXIT: |
127 | if (current->mm) | 275 | if (current->mm) |
128 | fpsimd_load_state(¤t->thread.fpsimd_state); | 276 | set_thread_flag(TIF_FOREIGN_FPSTATE); |
129 | break; | 277 | break; |
130 | case CPU_PM_ENTER_FAILED: | 278 | case CPU_PM_ENTER_FAILED: |
131 | default: | 279 | default: |
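The heart of the scheme described in the comment block at the top of fpsimd.c is a two-way binding check: the register contents are only trusted if the CPU still points at this task's state and the state still points back at this CPU. A stand-alone sketch of that predicate (stub type and helper name are assumptions for illustration, not part of the patch):

#include <stdbool.h>

struct fpsimd_state_stub {
        unsigned int cpu;       /* last CPU this state was loaded onto */
};

/*
 * True only if the per-cpu fpsimd_last_state pointer and the state's cpu
 * field still agree, i.e. this CPU's FPSIMD registers hold the task's most
 * recent userland state and TIF_FOREIGN_FPSTATE can stay clear.
 */
static bool fpsimd_state_is_live(const struct fpsimd_state_stub *st,
                                 const struct fpsimd_state_stub *fpsimd_last_state,
                                 unsigned int this_cpu)
{
        return fpsimd_last_state == st && st->cpu == this_cpu;
}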
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c new file mode 100644 index 000000000000..7924d73b6476 --- /dev/null +++ b/arch/arm64/kernel/ftrace.c | |||
@@ -0,0 +1,176 @@ | |||
1 | /* | ||
2 | * arch/arm64/kernel/ftrace.c | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/ftrace.h> | ||
13 | #include <linux/swab.h> | ||
14 | #include <linux/uaccess.h> | ||
15 | |||
16 | #include <asm/cacheflush.h> | ||
17 | #include <asm/ftrace.h> | ||
18 | #include <asm/insn.h> | ||
19 | |||
20 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
21 | /* | ||
22 | * Replace a single instruction, which may be a branch or NOP. | ||
23 | * If @validate == true, a replaced instruction is checked against 'old'. | ||
24 | */ | ||
25 | static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, | ||
26 | bool validate) | ||
27 | { | ||
28 | u32 replaced; | ||
29 | |||
30 | /* | ||
31 | * Note: | ||
32 | * Due to modules and __init, code can disappear and change, so | ||
33 | * we need to protect against faulting as well as code changing. | ||
34 | * We do this via aarch64_insn_*(), which use probe_kernel_*(). | ||
35 | * | ||
36 | * No lock is held here because all the modifications are run | ||
37 | * through stop_machine(). | ||
38 | */ | ||
39 | if (validate) { | ||
40 | if (aarch64_insn_read((void *)pc, &replaced)) | ||
41 | return -EFAULT; | ||
42 | |||
43 | if (replaced != old) | ||
44 | return -EINVAL; | ||
45 | } | ||
46 | if (aarch64_insn_patch_text_nosync((void *)pc, new)) | ||
47 | return -EPERM; | ||
48 | |||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * Replace tracer function in ftrace_caller() | ||
54 | */ | ||
55 | int ftrace_update_ftrace_func(ftrace_func_t func) | ||
56 | { | ||
57 | unsigned long pc; | ||
58 | u32 new; | ||
59 | |||
60 | pc = (unsigned long)&ftrace_call; | ||
61 | new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, true); | ||
62 | |||
63 | return ftrace_modify_code(pc, 0, new, false); | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * Turn on the call to ftrace_caller() in instrumented function | ||
68 | */ | ||
69 | int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) | ||
70 | { | ||
71 | unsigned long pc = rec->ip; | ||
72 | u32 old, new; | ||
73 | |||
74 | old = aarch64_insn_gen_nop(); | ||
75 | new = aarch64_insn_gen_branch_imm(pc, addr, true); | ||
76 | |||
77 | return ftrace_modify_code(pc, old, new, true); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Turn off the call to ftrace_caller() in instrumented function | ||
82 | */ | ||
83 | int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, | ||
84 | unsigned long addr) | ||
85 | { | ||
86 | unsigned long pc = rec->ip; | ||
87 | u32 old, new; | ||
88 | |||
89 | old = aarch64_insn_gen_branch_imm(pc, addr, true); | ||
90 | new = aarch64_insn_gen_nop(); | ||
91 | |||
92 | return ftrace_modify_code(pc, old, new, true); | ||
93 | } | ||
94 | |||
95 | int __init ftrace_dyn_arch_init(void) | ||
96 | { | ||
97 | return 0; | ||
98 | } | ||
99 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
100 | |||
101 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
102 | /* | ||
103 | * The function_graph tracer expects ftrace_return_to_handler() to be called | ||
104 | * on the way back to the parent. For this purpose, this function is called | ||
105 | * from _mcount() or ftrace_caller() to replace the return address (*parent) | ||
106 | * on the call stack with return_to_handler. | ||
107 | * | ||
108 | * Note that @frame_pointer is only used for a sanity check later. | ||
109 | */ | ||
110 | void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, | ||
111 | unsigned long frame_pointer) | ||
112 | { | ||
113 | unsigned long return_hooker = (unsigned long)&return_to_handler; | ||
114 | unsigned long old; | ||
115 | struct ftrace_graph_ent trace; | ||
116 | int err; | ||
117 | |||
118 | if (unlikely(atomic_read(¤t->tracing_graph_pause))) | ||
119 | return; | ||
120 | |||
121 | /* | ||
122 | * Note: | ||
123 | * No protection against faulting at *parent, which may be seen | ||
124 | * on other archs. It's unlikely on AArch64. | ||
125 | */ | ||
126 | old = *parent; | ||
127 | *parent = return_hooker; | ||
128 | |||
129 | trace.func = self_addr; | ||
130 | trace.depth = current->curr_ret_stack + 1; | ||
131 | |||
132 | /* Only trace if the calling function expects to */ | ||
133 | if (!ftrace_graph_entry(&trace)) { | ||
134 | *parent = old; | ||
135 | return; | ||
136 | } | ||
137 | |||
138 | err = ftrace_push_return_trace(old, self_addr, &trace.depth, | ||
139 | frame_pointer); | ||
140 | if (err == -EBUSY) { | ||
141 | *parent = old; | ||
142 | return; | ||
143 | } | ||
144 | } | ||
145 | |||
146 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
147 | /* | ||
148 | * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() | ||
149 | * depending on @enable. | ||
150 | */ | ||
151 | static int ftrace_modify_graph_caller(bool enable) | ||
152 | { | ||
153 | unsigned long pc = (unsigned long)&ftrace_graph_call; | ||
154 | u32 branch, nop; | ||
155 | |||
156 | branch = aarch64_insn_gen_branch_imm(pc, | ||
157 | (unsigned long)ftrace_graph_caller, false); | ||
158 | nop = aarch64_insn_gen_nop(); | ||
159 | |||
160 | if (enable) | ||
161 | return ftrace_modify_code(pc, nop, branch, true); | ||
162 | else | ||
163 | return ftrace_modify_code(pc, branch, nop, true); | ||
164 | } | ||
165 | |||
166 | int ftrace_enable_ftrace_graph_caller(void) | ||
167 | { | ||
168 | return ftrace_modify_graph_caller(true); | ||
169 | } | ||
170 | |||
171 | int ftrace_disable_ftrace_graph_caller(void) | ||
172 | { | ||
173 | return ftrace_modify_graph_caller(false); | ||
174 | } | ||
175 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
176 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
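The validate-then-patch step in ftrace_modify_code() can be pictured with a plain userspace model: read the instruction currently at the site, refuse to touch it if it is not what the caller expected, then write the replacement. This sketch uses ordinary memory accesses in place of aarch64_insn_read() / aarch64_insn_patch_text_nosync(), so it only illustrates the control flow:

#include <stdint.h>
#include <string.h>
#include <errno.h>

static int patch_insn(uint32_t *site, uint32_t old, uint32_t new_insn, int validate)
{
        uint32_t cur;

        if (validate) {
                memcpy(&cur, site, sizeof(cur));        /* aarch64_insn_read() stand-in */
                if (cur != old)
                        return -EINVAL;                 /* site is not what we expected */
        }
        memcpy(site, &new_insn, sizeof(new_insn));      /* patch_text_nosync() stand-in */
        return 0;
}

ftrace_make_call() and ftrace_make_nop() then amount to calling this helper with (NOP, branch) or (branch, NOP) as the old/new pair.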
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 738291b5be29..a96d3a6a63f6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S | |||
@@ -342,11 +342,9 @@ ENTRY(set_cpu_boot_mode_flag) | |||
342 | cmp w20, #BOOT_CPU_MODE_EL2 | 342 | cmp w20, #BOOT_CPU_MODE_EL2 |
343 | b.ne 1f | 343 | b.ne 1f |
344 | add x1, x1, #4 | 344 | add x1, x1, #4 |
345 | 1: dc cvac, x1 // Clean potentially dirty cache line | 345 | 1: str w20, [x1] // This CPU has booted in EL1 |
346 | dsb sy | 346 | dmb sy |
347 | str w20, [x1] // This CPU has booted in EL1 | 347 | dc ivac, x1 // Invalidate potentially stale cache line |
348 | dc civac, x1 // Clean&invalidate potentially stale cache line | ||
349 | dsb sy | ||
350 | ret | 348 | ret |
351 | ENDPROC(set_cpu_boot_mode_flag) | 349 | ENDPROC(set_cpu_boot_mode_flag) |
352 | 350 | ||
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index bee789757806..df1cf15377b4 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c | |||
@@ -20,6 +20,7 @@ | |||
20 | 20 | ||
21 | #define pr_fmt(fmt) "hw-breakpoint: " fmt | 21 | #define pr_fmt(fmt) "hw-breakpoint: " fmt |
22 | 22 | ||
23 | #include <linux/compat.h> | ||
23 | #include <linux/cpu_pm.h> | 24 | #include <linux/cpu_pm.h> |
24 | #include <linux/errno.h> | 25 | #include <linux/errno.h> |
25 | #include <linux/hw_breakpoint.h> | 26 | #include <linux/hw_breakpoint.h> |
@@ -27,7 +28,6 @@ | |||
27 | #include <linux/ptrace.h> | 28 | #include <linux/ptrace.h> |
28 | #include <linux/smp.h> | 29 | #include <linux/smp.h> |
29 | 30 | ||
30 | #include <asm/compat.h> | ||
31 | #include <asm/current.h> | 31 | #include <asm/current.h> |
32 | #include <asm/debug-monitors.h> | 32 | #include <asm/debug-monitors.h> |
33 | #include <asm/hw_breakpoint.h> | 33 | #include <asm/hw_breakpoint.h> |
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6391485f342d..43b7c34f92cb 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c | |||
@@ -20,6 +20,7 @@ | |||
20 | 20 | ||
21 | #include <stdarg.h> | 21 | #include <stdarg.h> |
22 | 22 | ||
23 | #include <linux/compat.h> | ||
23 | #include <linux/export.h> | 24 | #include <linux/export.h> |
24 | #include <linux/sched.h> | 25 | #include <linux/sched.h> |
25 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
@@ -113,32 +114,62 @@ void arch_cpu_idle_dead(void) | |||
113 | } | 114 | } |
114 | #endif | 115 | #endif |
115 | 116 | ||
117 | /* | ||
118 | * Called by kexec, immediately prior to machine_kexec(). | ||
119 | * | ||
120 | * This must completely disable all secondary CPUs; simply causing those CPUs | ||
121 | * to execute e.g. a RAM-based pin loop is not sufficient. This allows the | ||
122 | * kexec'd kernel to use any and all RAM as it sees fit, without having to | ||
123 | * avoid any code or data used by any SW CPU pin loop. The CPU hotplug | ||
124 | * functionality embodied in disable_nonboot_cpus() is used to achieve this. | ||
125 | */ | ||
116 | void machine_shutdown(void) | 126 | void machine_shutdown(void) |
117 | { | 127 | { |
118 | #ifdef CONFIG_SMP | 128 | disable_nonboot_cpus(); |
119 | smp_send_stop(); | ||
120 | #endif | ||
121 | } | 129 | } |
122 | 130 | ||
131 | /* | ||
132 | * Halting simply requires that the secondary CPUs stop performing any | ||
133 | * activity (executing tasks, handling interrupts). smp_send_stop() | ||
134 | * achieves this. | ||
135 | */ | ||
123 | void machine_halt(void) | 136 | void machine_halt(void) |
124 | { | 137 | { |
125 | machine_shutdown(); | 138 | local_irq_disable(); |
139 | smp_send_stop(); | ||
126 | while (1); | 140 | while (1); |
127 | } | 141 | } |
128 | 142 | ||
143 | /* | ||
144 | * Power-off simply requires that the secondary CPUs stop performing any | ||
145 | * activity (executing tasks, handling interrupts). smp_send_stop() | ||
146 | * achieves this. When the system power is turned off, it will take all CPUs | ||
147 | * with it. | ||
148 | */ | ||
129 | void machine_power_off(void) | 149 | void machine_power_off(void) |
130 | { | 150 | { |
131 | machine_shutdown(); | 151 | local_irq_disable(); |
152 | smp_send_stop(); | ||
132 | if (pm_power_off) | 153 | if (pm_power_off) |
133 | pm_power_off(); | 154 | pm_power_off(); |
134 | } | 155 | } |
135 | 156 | ||
157 | /* | ||
158 | * Restart requires that the secondary CPUs stop performing any activity | ||
159 | * while the primary CPU resets the system. Systems with a single CPU can | ||
160 | * use soft_restart() as their machine descriptor's .restart hook, since that | ||
161 | * will cause the only available CPU to reset. Systems with multiple CPUs must | ||
162 | * provide a HW restart implementation, to ensure that all CPUs reset at once. | ||
163 | * This is required so that any code running after reset on the primary CPU | ||
164 | * doesn't have to co-ordinate with other CPUs to ensure they aren't still | ||
165 | * executing pre-reset code, and using RAM that the primary CPU's code wishes | ||
166 | * to use. Implementing such co-ordination would be essentially impossible. | ||
167 | */ | ||
136 | void machine_restart(char *cmd) | 168 | void machine_restart(char *cmd) |
137 | { | 169 | { |
138 | machine_shutdown(); | ||
139 | |||
140 | /* Disable interrupts first */ | 170 | /* Disable interrupts first */ |
141 | local_irq_disable(); | 171 | local_irq_disable(); |
172 | smp_send_stop(); | ||
142 | 173 | ||
143 | /* Now call the architecture specific reboot code. */ | 174 | /* Now call the architecture specific reboot code. */ |
144 | if (arm_pm_restart) | 175 | if (arm_pm_restart) |
@@ -205,7 +236,7 @@ void release_thread(struct task_struct *dead_task) | |||
205 | 236 | ||
206 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 237 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
207 | { | 238 | { |
208 | fpsimd_save_state(¤t->thread.fpsimd_state); | 239 | fpsimd_preserve_current_state(); |
209 | *dst = *src; | 240 | *dst = *src; |
210 | return 0; | 241 | return 0; |
211 | } | 242 | } |
@@ -300,7 +331,7 @@ struct task_struct *__switch_to(struct task_struct *prev, | |||
300 | * Complete any pending TLB or cache maintenance on this CPU in case | 331 | * Complete any pending TLB or cache maintenance on this CPU in case |
301 | * the thread migrates to a different CPU. | 332 | * the thread migrates to a different CPU. |
302 | */ | 333 | */ |
303 | dsb(); | 334 | dsb(ish); |
304 | 335 | ||
305 | /* the actual thread switch */ | 336 | /* the actual thread switch */ |
306 | last = cpu_switch_to(prev, next); | 337 | last = cpu_switch_to(prev, next); |
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 6a8928bba03c..3e926b9c0641 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c | |||
@@ -19,6 +19,7 @@ | |||
19 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/compat.h> | ||
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
23 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
24 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
@@ -41,6 +42,9 @@ | |||
41 | #include <asm/traps.h> | 42 | #include <asm/traps.h> |
42 | #include <asm/system_misc.h> | 43 | #include <asm/system_misc.h> |
43 | 44 | ||
45 | #define CREATE_TRACE_POINTS | ||
46 | #include <trace/events/syscalls.h> | ||
47 | |||
44 | /* | 48 | /* |
45 | * TODO: does not yet catch signals sent when the child dies. | 49 | * TODO: does not yet catch signals sent when the child dies. |
46 | * in exit.c or in signal.c. | 50 | * in exit.c or in signal.c. |
@@ -517,6 +521,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, | |||
517 | return ret; | 521 | return ret; |
518 | 522 | ||
519 | target->thread.fpsimd_state.user_fpsimd = newstate; | 523 | target->thread.fpsimd_state.user_fpsimd = newstate; |
524 | fpsimd_flush_task_state(target); | ||
520 | return ret; | 525 | return ret; |
521 | } | 526 | } |
522 | 527 | ||
@@ -764,6 +769,7 @@ static int compat_vfp_set(struct task_struct *target, | |||
764 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; | 769 | uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; |
765 | } | 770 | } |
766 | 771 | ||
772 | fpsimd_flush_task_state(target); | ||
767 | return ret; | 773 | return ret; |
768 | } | 774 | } |
769 | 775 | ||
@@ -1058,35 +1064,49 @@ long arch_ptrace(struct task_struct *child, long request, | |||
1058 | return ptrace_request(child, request, addr, data); | 1064 | return ptrace_request(child, request, addr, data); |
1059 | } | 1065 | } |
1060 | 1066 | ||
1061 | asmlinkage int syscall_trace(int dir, struct pt_regs *regs) | 1067 | enum ptrace_syscall_dir { |
1068 | PTRACE_SYSCALL_ENTER = 0, | ||
1069 | PTRACE_SYSCALL_EXIT, | ||
1070 | }; | ||
1071 | |||
1072 | static void tracehook_report_syscall(struct pt_regs *regs, | ||
1073 | enum ptrace_syscall_dir dir) | ||
1062 | { | 1074 | { |
1075 | int regno; | ||
1063 | unsigned long saved_reg; | 1076 | unsigned long saved_reg; |
1064 | 1077 | ||
1065 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) | 1078 | /* |
1066 | return regs->syscallno; | 1079 | * A scratch register (ip(r12) on AArch32, x7 on AArch64) is |
1067 | 1080 | * used to denote syscall entry/exit: | |
1068 | if (is_compat_task()) { | 1081 | */ |
1069 | /* AArch32 uses ip (r12) for scratch */ | 1082 | regno = (is_compat_task() ? 12 : 7); |
1070 | saved_reg = regs->regs[12]; | 1083 | saved_reg = regs->regs[regno]; |
1071 | regs->regs[12] = dir; | 1084 | regs->regs[regno] = dir; |
1072 | } else { | ||
1073 | /* | ||
1074 | * Save X7. X7 is used to denote syscall entry/exit: | ||
1075 | * X7 = 0 -> entry, = 1 -> exit | ||
1076 | */ | ||
1077 | saved_reg = regs->regs[7]; | ||
1078 | regs->regs[7] = dir; | ||
1079 | } | ||
1080 | 1085 | ||
1081 | if (dir) | 1086 | if (dir == PTRACE_SYSCALL_EXIT) |
1082 | tracehook_report_syscall_exit(regs, 0); | 1087 | tracehook_report_syscall_exit(regs, 0); |
1083 | else if (tracehook_report_syscall_entry(regs)) | 1088 | else if (tracehook_report_syscall_entry(regs)) |
1084 | regs->syscallno = ~0UL; | 1089 | regs->syscallno = ~0UL; |
1085 | 1090 | ||
1086 | if (is_compat_task()) | 1091 | regs->regs[regno] = saved_reg; |
1087 | regs->regs[12] = saved_reg; | 1092 | } |
1088 | else | 1093 | |
1089 | regs->regs[7] = saved_reg; | 1094 | asmlinkage int syscall_trace_enter(struct pt_regs *regs) |
1095 | { | ||
1096 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | ||
1097 | tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); | ||
1098 | |||
1099 | if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) | ||
1100 | trace_sys_enter(regs, regs->syscallno); | ||
1090 | 1101 | ||
1091 | return regs->syscallno; | 1102 | return regs->syscallno; |
1092 | } | 1103 | } |
1104 | |||
1105 | asmlinkage void syscall_trace_exit(struct pt_regs *regs) | ||
1106 | { | ||
1107 | if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) | ||
1108 | trace_sys_exit(regs, regs_return_value(regs)); | ||
1109 | |||
1110 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | ||
1111 | tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); | ||
1112 | } | ||
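The scratch-register convention above (x7 on AArch64, ip/r12 on AArch32, written to 0 on entry and 1 on exit) is also what a ptrace-based tracer observes while the tracee is stopped at a syscall-trace stop. As a rough illustration only (not part of this patch; the tracer setup with PTRACE_TRACEME and the PTRACE_SYSCALL stop loop is omitted, and error handling is minimal), a native AArch64 tracer could tell entry from exit like this:

    /* Illustrative sketch: read the tracee's registers at a syscall stop. */
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <sys/uio.h>
    #include <linux/elf.h>      /* NT_PRSTATUS */
    #include <asm/ptrace.h>     /* struct user_pt_regs */

    static void report_syscall_stop(pid_t pid)
    {
            struct user_pt_regs regs;
            struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

            if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) == 0)
                    /* x8 carries the syscall number; x7 is 0 on entry, 1 on exit */
                    printf("syscall %llu: %s\n",
                           (unsigned long long)regs.regs[8],
                           regs.regs[7] ? "exit" : "enter");
    }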
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c new file mode 100644 index 000000000000..89102a6ffad5 --- /dev/null +++ b/arch/arm64/kernel/return_address.c | |||
@@ -0,0 +1,55 @@ | |||
1 | /* | ||
2 | * arch/arm64/kernel/return_address.c | ||
3 | * | ||
4 | * Copyright (C) 2013 Linaro Limited | ||
5 | * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | |||
12 | #include <linux/export.h> | ||
13 | #include <linux/ftrace.h> | ||
14 | |||
15 | #include <asm/stacktrace.h> | ||
16 | |||
17 | struct return_address_data { | ||
18 | unsigned int level; | ||
19 | void *addr; | ||
20 | }; | ||
21 | |||
22 | static int save_return_addr(struct stackframe *frame, void *d) | ||
23 | { | ||
24 | struct return_address_data *data = d; | ||
25 | |||
26 | if (!data->level) { | ||
27 | data->addr = (void *)frame->pc; | ||
28 | return 1; | ||
29 | } else { | ||
30 | --data->level; | ||
31 | return 0; | ||
32 | } | ||
33 | } | ||
34 | |||
35 | void *return_address(unsigned int level) | ||
36 | { | ||
37 | struct return_address_data data; | ||
38 | struct stackframe frame; | ||
39 | register unsigned long current_sp asm ("sp"); | ||
40 | |||
41 | data.level = level + 2; | ||
42 | data.addr = NULL; | ||
43 | |||
44 | frame.fp = (unsigned long)__builtin_frame_address(0); | ||
45 | frame.sp = current_sp; | ||
46 | frame.pc = (unsigned long)return_address; /* dummy */ | ||
47 | |||
48 | walk_stackframe(&frame, save_return_addr, &data); | ||
49 | |||
50 | if (!data.level) | ||
51 | return data.addr; | ||
52 | else | ||
53 | return NULL; | ||
54 | } | ||
55 | EXPORT_SYMBOL_GPL(return_address); | ||
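return_address(level) walks the current stack with walk_stackframe(), skips its own frame and its immediate caller's, and hands back the saved return address 'level' frames further up, or NULL if the stack is shallower than that. A hedged in-kernel usage sketch, purely for illustration (the helper name is made up):

    #include <linux/ftrace.h>
    #include <linux/printk.h>

    static void show_callers(void)
    {
            /* level 0: who called us; level 1: one frame further up */
            pr_info("called from %pS, which was reached from %pS\n",
                    return_address(0), return_address(1));
    }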
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index e578171b22ff..46d1125571f6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/utsname.h> | 25 | #include <linux/utsname.h> |
26 | #include <linux/initrd.h> | 26 | #include <linux/initrd.h> |
27 | #include <linux/console.h> | 27 | #include <linux/console.h> |
28 | #include <linux/cache.h> | ||
28 | #include <linux/bootmem.h> | 29 | #include <linux/bootmem.h> |
29 | #include <linux/seq_file.h> | 30 | #include <linux/seq_file.h> |
30 | #include <linux/screen_info.h> | 31 | #include <linux/screen_info.h> |
@@ -200,6 +201,8 @@ static void __init setup_processor(void) | |||
200 | { | 201 | { |
201 | struct cpu_info *cpu_info; | 202 | struct cpu_info *cpu_info; |
202 | u64 features, block; | 203 | u64 features, block; |
204 | u32 cwg; | ||
205 | int cls; | ||
203 | 206 | ||
204 | cpu_info = lookup_processor_type(read_cpuid_id()); | 207 | cpu_info = lookup_processor_type(read_cpuid_id()); |
205 | if (!cpu_info) { | 208 | if (!cpu_info) { |
@@ -217,6 +220,18 @@ static void __init setup_processor(void) | |||
217 | elf_hwcap = 0; | 220 | elf_hwcap = 0; |
218 | 221 | ||
219 | /* | 222 | /* |
223 | * Check for sane CTR_EL0.CWG value. | ||
224 | */ | ||
225 | cwg = cache_type_cwg(); | ||
226 | cls = cache_line_size(); | ||
227 | if (!cwg) | ||
228 | pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n", | ||
229 | cls); | ||
230 | if (L1_CACHE_BYTES < cls) | ||
231 | pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", | ||
232 | L1_CACHE_BYTES, cls); | ||
233 | |||
234 | /* | ||
220 | * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. | 235 | * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. |
221 | * The blocks we test below represent incremental functionality | 236 | * The blocks we test below represent incremental functionality |
222 | * for non-negative values. Negative values are reserved. | 237 | * for non-negative values. Negative values are reserved. |
@@ -363,7 +378,6 @@ void __init setup_arch(char **cmdline_p) | |||
363 | 378 | ||
364 | *cmdline_p = boot_command_line; | 379 | *cmdline_p = boot_command_line; |
365 | 380 | ||
366 | init_mem_pgprot(); | ||
367 | early_ioremap_init(); | 381 | early_ioremap_init(); |
368 | 382 | ||
369 | parse_early_param(); | 383 | parse_early_param(); |
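The new check in setup_processor() compares L1_CACHE_BYTES against the Cache Writeback Granule advertised in CTR_EL0, warning when the compile-time constant is too small or when the CPU reports no CWG at all. As a hedged sketch only (the real helpers live in asm/cachetype.h and asm/cache.h and may differ in detail), cache_type_cwg() and cache_line_size() can be thought of as:

    /*
     * Sketch under the assumption that CTR_EL0.CWG is bits [27:24],
     * encoding log2 of the granule in 4-byte words; 0 means "not provided".
     * u32/u64 come from linux/types.h in kernel context.
     */
    #define CTR_CWG_SHIFT   24
    #define CTR_CWG_MASK    0xf

    static inline u32 cache_type_cwg(void)
    {
            u64 ctr;

            asm volatile("mrs %0, ctr_el0" : "=r" (ctr));
            return (ctr >> CTR_CWG_SHIFT) & CTR_CWG_MASK;
    }

    static inline int cache_line_size(void)
    {
            u32 cwg = cache_type_cwg();

            return cwg ? 4 << cwg : L1_CACHE_BYTES;   /* fall back when absent */
    }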
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 890a591f75dd..6357b9c6c90e 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c | |||
@@ -17,6 +17,7 @@ | |||
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/compat.h> | ||
20 | #include <linux/errno.h> | 21 | #include <linux/errno.h> |
21 | #include <linux/signal.h> | 22 | #include <linux/signal.h> |
22 | #include <linux/personality.h> | 23 | #include <linux/personality.h> |
@@ -25,7 +26,6 @@ | |||
25 | #include <linux/tracehook.h> | 26 | #include <linux/tracehook.h> |
26 | #include <linux/ratelimit.h> | 27 | #include <linux/ratelimit.h> |
27 | 28 | ||
28 | #include <asm/compat.h> | ||
29 | #include <asm/debug-monitors.h> | 29 | #include <asm/debug-monitors.h> |
30 | #include <asm/elf.h> | 30 | #include <asm/elf.h> |
31 | #include <asm/cacheflush.h> | 31 | #include <asm/cacheflush.h> |
@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) | |||
51 | int err; | 51 | int err; |
52 | 52 | ||
53 | /* dump the hardware registers to the fpsimd_state structure */ | 53 | /* dump the hardware registers to the fpsimd_state structure */ |
54 | fpsimd_save_state(fpsimd); | 54 | fpsimd_preserve_current_state(); |
55 | 55 | ||
56 | /* copy the FP and status/control registers */ | 56 | /* copy the FP and status/control registers */ |
57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); | 57 | err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); |
@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) | |||
86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); | 86 | __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); |
87 | 87 | ||
88 | /* load the hardware registers from the fpsimd_state structure */ | 88 | /* load the hardware registers from the fpsimd_state structure */ |
89 | if (!err) { | 89 | if (!err) |
90 | preempt_disable(); | 90 | fpsimd_update_current_state(&fpsimd); |
91 | fpsimd_load_state(&fpsimd); | ||
92 | preempt_enable(); | ||
93 | } | ||
94 | 91 | ||
95 | return err ? -EFAULT : 0; | 92 | return err ? -EFAULT : 0; |
96 | } | 93 | } |
@@ -100,8 +97,7 @@ static int restore_sigframe(struct pt_regs *regs, | |||
100 | { | 97 | { |
101 | sigset_t set; | 98 | sigset_t set; |
102 | int i, err; | 99 | int i, err; |
103 | struct aux_context __user *aux = | 100 | void *aux = sf->uc.uc_mcontext.__reserved; |
104 | (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; | ||
105 | 101 | ||
106 | err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); | 102 | err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); |
107 | if (err == 0) | 103 | if (err == 0) |
@@ -121,8 +117,11 @@ static int restore_sigframe(struct pt_regs *regs, | |||
121 | 117 | ||
122 | err |= !valid_user_regs(®s->user_regs); | 118 | err |= !valid_user_regs(®s->user_regs); |
123 | 119 | ||
124 | if (err == 0) | 120 | if (err == 0) { |
125 | err |= restore_fpsimd_context(&aux->fpsimd); | 121 | struct fpsimd_context *fpsimd_ctx = |
122 | container_of(aux, struct fpsimd_context, head); | ||
123 | err |= restore_fpsimd_context(fpsimd_ctx); | ||
124 | } | ||
126 | 125 | ||
127 | return err; | 126 | return err; |
128 | } | 127 | } |
@@ -167,8 +166,8 @@ static int setup_sigframe(struct rt_sigframe __user *sf, | |||
167 | struct pt_regs *regs, sigset_t *set) | 166 | struct pt_regs *regs, sigset_t *set) |
168 | { | 167 | { |
169 | int i, err = 0; | 168 | int i, err = 0; |
170 | struct aux_context __user *aux = | 169 | void *aux = sf->uc.uc_mcontext.__reserved; |
171 | (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; | 170 | struct _aarch64_ctx *end; |
172 | 171 | ||
173 | /* set up the stack frame for unwinding */ | 172 | /* set up the stack frame for unwinding */ |
174 | __put_user_error(regs->regs[29], &sf->fp, err); | 173 | __put_user_error(regs->regs[29], &sf->fp, err); |
@@ -185,12 +184,27 @@ static int setup_sigframe(struct rt_sigframe __user *sf, | |||
185 | 184 | ||
186 | err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); | 185 | err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); |
187 | 186 | ||
188 | if (err == 0) | 187 | if (err == 0) { |
189 | err |= preserve_fpsimd_context(&aux->fpsimd); | 188 | struct fpsimd_context *fpsimd_ctx = |
189 | container_of(aux, struct fpsimd_context, head); | ||
190 | err |= preserve_fpsimd_context(fpsimd_ctx); | ||
191 | aux += sizeof(*fpsimd_ctx); | ||
192 | } | ||
193 | |||
194 | /* fault information, if valid */ | ||
195 | if (current->thread.fault_code) { | ||
196 | struct esr_context *esr_ctx = | ||
197 | container_of(aux, struct esr_context, head); | ||
198 | __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); | ||
199 | __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); | ||
200 | __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); | ||
201 | aux += sizeof(*esr_ctx); | ||
202 | } | ||
190 | 203 | ||
191 | /* set the "end" magic */ | 204 | /* set the "end" magic */ |
192 | __put_user_error(0, &aux->end.magic, err); | 205 | end = aux; |
193 | __put_user_error(0, &aux->end.size, err); | 206 | __put_user_error(0, &end->magic, err); |
207 | __put_user_error(0, &end->size, err); | ||
194 | 208 | ||
195 | return err; | 209 | return err; |
196 | } | 210 | } |
@@ -416,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, | |||
416 | clear_thread_flag(TIF_NOTIFY_RESUME); | 430 | clear_thread_flag(TIF_NOTIFY_RESUME); |
417 | tracehook_notify_resume(regs); | 431 | tracehook_notify_resume(regs); |
418 | } | 432 | } |
433 | |||
434 | if (thread_flags & _TIF_FOREIGN_FPSTATE) | ||
435 | fpsimd_restore_current_state(); | ||
436 | |||
419 | } | 437 | } |
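With the fixed struct aux_context gone, the __reserved area of the signal frame becomes a sequence of tagged records: an fpsimd_context, an optional esr_context when fault information is available, and a zero "end" header. User space can walk it generically by magic/size. A hedged user-space sketch, not part of this patch (printf from a signal handler is for illustration only):

    #include <stdio.h>
    #include <signal.h>
    #include <ucontext.h>
    #include <asm/sigcontext.h>   /* _aarch64_ctx, fpsimd_context, esr_context */

    static void handler(int sig, siginfo_t *info, void *ucontext)
    {
            ucontext_t *uc = ucontext;
            unsigned char *p = (unsigned char *)uc->uc_mcontext.__reserved;

            for (;;) {
                    struct _aarch64_ctx *head = (struct _aarch64_ctx *)p;

                    if (!head->magic)                     /* terminating record */
                            break;
                    if (head->magic == FPSIMD_MAGIC)
                            printf("FPSIMD state present (%u bytes)\n", head->size);
                    else if (head->magic == ESR_MAGIC)
                            printf("fault ESR: %#llx\n",
                                   (unsigned long long)((struct esr_context *)p)->esr);
                    p += head->size;
            }
    }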
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index b3fc9f5ec6d3..3491c638f172 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
24 | #include <linux/ratelimit.h> | 24 | #include <linux/ratelimit.h> |
25 | 25 | ||
26 | #include <asm/esr.h> | ||
26 | #include <asm/fpsimd.h> | 27 | #include <asm/fpsimd.h> |
27 | #include <asm/signal32.h> | 28 | #include <asm/signal32.h> |
28 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
@@ -81,6 +82,8 @@ struct compat_vfp_sigframe { | |||
81 | #define VFP_MAGIC 0x56465001 | 82 | #define VFP_MAGIC 0x56465001 |
82 | #define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe) | 83 | #define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe) |
83 | 84 | ||
85 | #define FSR_WRITE_SHIFT (11) | ||
86 | |||
84 | struct compat_aux_sigframe { | 87 | struct compat_aux_sigframe { |
85 | struct compat_vfp_sigframe vfp; | 88 | struct compat_vfp_sigframe vfp; |
86 | 89 | ||
@@ -219,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
219 | * Note that this also saves V16-31, which aren't visible | 222 | * Note that this also saves V16-31, which aren't visible |
220 | * in AArch32. | 223 | * in AArch32. |
221 | */ | 224 | */ |
222 | fpsimd_save_state(fpsimd); | 225 | fpsimd_preserve_current_state(); |
223 | 226 | ||
224 | /* Place structure header on the stack */ | 227 | /* Place structure header on the stack */ |
225 | __put_user_error(magic, &frame->magic, err); | 228 | __put_user_error(magic, &frame->magic, err); |
@@ -282,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) | |||
282 | * We don't need to touch the exception register, so | 285 | * We don't need to touch the exception register, so |
283 | * reload the hardware state. | 286 | * reload the hardware state. |
284 | */ | 287 | */ |
285 | if (!err) { | 288 | if (!err) |
286 | preempt_disable(); | 289 | fpsimd_update_current_state(&fpsimd); |
287 | fpsimd_load_state(&fpsimd); | ||
288 | preempt_enable(); | ||
289 | } | ||
290 | 290 | ||
291 | return err ? -EFAULT : 0; | 291 | return err ? -EFAULT : 0; |
292 | } | 292 | } |
@@ -500,7 +500,9 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, | |||
500 | __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); | 500 | __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); |
501 | 501 | ||
502 | __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); | 502 | __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); |
503 | __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.error_code, err); | 503 | /* set the compat FSR WnR */ |
504 | __put_user_error(!!(current->thread.fault_code & ESR_EL1_WRITE) << | ||
505 | FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err); | ||
504 | __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); | 506 | __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); |
505 | __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); | 507 | __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); |
506 | 508 | ||
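Previously the compat error_code was always zero; it now carries the FSR write-not-read bit (bit 11), derived from the recorded ESR, so 32-bit signal handlers can distinguish faulting writes from faulting reads. A hedged compat (AArch32) user-space sketch, for illustration only and assuming the C library exposes error_code in mcontext_t:

    #include <stdio.h>
    #include <signal.h>
    #include <ucontext.h>

    #define FSR_WRITE_BIT   (1UL << 11)

    static void segv_handler(int sig, siginfo_t *info, void *ucontext)
    {
            ucontext_t *uc = ucontext;

            printf("fault at %p on a %s access\n", info->si_addr,
                   (uc->uc_mcontext.error_code & FSR_WRITE_BIT) ? "write" : "read");
    }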
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index c3cb160edc69..40f38f46c8e0 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/clockchips.h> | 35 | #include <linux/clockchips.h> |
36 | #include <linux/completion.h> | 36 | #include <linux/completion.h> |
37 | #include <linux/of.h> | 37 | #include <linux/of.h> |
38 | #include <linux/irq_work.h> | ||
38 | 39 | ||
39 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
40 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
@@ -62,6 +63,7 @@ enum ipi_msg_type { | |||
62 | IPI_CALL_FUNC_SINGLE, | 63 | IPI_CALL_FUNC_SINGLE, |
63 | IPI_CPU_STOP, | 64 | IPI_CPU_STOP, |
64 | IPI_TIMER, | 65 | IPI_TIMER, |
66 | IPI_IRQ_WORK, | ||
65 | }; | 67 | }; |
66 | 68 | ||
67 | /* | 69 | /* |
@@ -477,6 +479,14 @@ void arch_send_call_function_single_ipi(int cpu) | |||
477 | smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); | 479 | smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); |
478 | } | 480 | } |
479 | 481 | ||
482 | #ifdef CONFIG_IRQ_WORK | ||
483 | void arch_irq_work_raise(void) | ||
484 | { | ||
485 | if (smp_cross_call) | ||
486 | smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); | ||
487 | } | ||
488 | #endif | ||
489 | |||
480 | static const char *ipi_types[NR_IPI] = { | 490 | static const char *ipi_types[NR_IPI] = { |
481 | #define S(x,s) [x - IPI_RESCHEDULE] = s | 491 | #define S(x,s) [x - IPI_RESCHEDULE] = s |
482 | S(IPI_RESCHEDULE, "Rescheduling interrupts"), | 492 | S(IPI_RESCHEDULE, "Rescheduling interrupts"), |
@@ -484,6 +494,7 @@ static const char *ipi_types[NR_IPI] = { | |||
484 | S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), | 494 | S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), |
485 | S(IPI_CPU_STOP, "CPU stop interrupts"), | 495 | S(IPI_CPU_STOP, "CPU stop interrupts"), |
486 | S(IPI_TIMER, "Timer broadcast interrupts"), | 496 | S(IPI_TIMER, "Timer broadcast interrupts"), |
497 | S(IPI_IRQ_WORK, "IRQ work interrupts"), | ||
487 | }; | 498 | }; |
488 | 499 | ||
489 | void show_ipi_list(struct seq_file *p, int prec) | 500 | void show_ipi_list(struct seq_file *p, int prec) |
@@ -576,6 +587,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) | |||
576 | break; | 587 | break; |
577 | #endif | 588 | #endif |
578 | 589 | ||
590 | #ifdef CONFIG_IRQ_WORK | ||
591 | case IPI_IRQ_WORK: | ||
592 | irq_enter(); | ||
593 | irq_work_run(); | ||
594 | irq_exit(); | ||
595 | break; | ||
596 | #endif | ||
597 | |||
579 | default: | 598 | default: |
580 | pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); | 599 | pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); |
581 | break; | 600 | break; |
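With arch_irq_work_raise() wired to the new IPI_IRQ_WORK, work queued with irq_work_queue() now runs promptly in interrupt context on the queueing CPU rather than waiting for the next timer tick. A hedged usage sketch (the function and variable names are made up):

    #include <linux/init.h>
    #include <linux/irq_work.h>
    #include <linux/printk.h>
    #include <linux/smp.h>

    static struct irq_work my_work;

    static void my_work_fn(struct irq_work *work)
    {
            pr_info("irq_work ran on CPU%d\n", smp_processor_id());
    }

    static void kick_my_work(void)
    {
            /* safe from atomic context; raises IPI_IRQ_WORK on the local CPU */
            irq_work_queue(&my_work);
    }

    static int __init my_work_init(void)
    {
            init_irq_work(&my_work, my_work_fn);
            return 0;
    }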
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 7a530d2cc807..0347d38eea29 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c | |||
@@ -30,7 +30,6 @@ extern void secondary_holding_pen(void); | |||
30 | volatile unsigned long secondary_holding_pen_release = INVALID_HWID; | 30 | volatile unsigned long secondary_holding_pen_release = INVALID_HWID; |
31 | 31 | ||
32 | static phys_addr_t cpu_release_addr[NR_CPUS]; | 32 | static phys_addr_t cpu_release_addr[NR_CPUS]; |
33 | static DEFINE_RAW_SPINLOCK(boot_lock); | ||
34 | 33 | ||
35 | /* | 34 | /* |
36 | * Write secondary_holding_pen_release in a way that is guaranteed to be | 35 | * Write secondary_holding_pen_release in a way that is guaranteed to be |
@@ -94,14 +93,6 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) | |||
94 | 93 | ||
95 | static int smp_spin_table_cpu_boot(unsigned int cpu) | 94 | static int smp_spin_table_cpu_boot(unsigned int cpu) |
96 | { | 95 | { |
97 | unsigned long timeout; | ||
98 | |||
99 | /* | ||
100 | * Set synchronisation state between this boot processor | ||
101 | * and the secondary one | ||
102 | */ | ||
103 | raw_spin_lock(&boot_lock); | ||
104 | |||
105 | /* | 96 | /* |
106 | * Update the pen release flag. | 97 | * Update the pen release flag. |
107 | */ | 98 | */ |
@@ -112,34 +103,7 @@ static int smp_spin_table_cpu_boot(unsigned int cpu) | |||
112 | */ | 103 | */ |
113 | sev(); | 104 | sev(); |
114 | 105 | ||
115 | timeout = jiffies + (1 * HZ); | 106 | return 0; |
116 | while (time_before(jiffies, timeout)) { | ||
117 | if (secondary_holding_pen_release == INVALID_HWID) | ||
118 | break; | ||
119 | udelay(10); | ||
120 | } | ||
121 | |||
122 | /* | ||
123 | * Now the secondary core is starting up let it run its | ||
124 | * calibrations, then wait for it to finish | ||
125 | */ | ||
126 | raw_spin_unlock(&boot_lock); | ||
127 | |||
128 | return secondary_holding_pen_release != INVALID_HWID ? -ENOSYS : 0; | ||
129 | } | ||
130 | |||
131 | static void smp_spin_table_cpu_postboot(void) | ||
132 | { | ||
133 | /* | ||
134 | * Let the primary processor know we're out of the pen. | ||
135 | */ | ||
136 | write_pen_release(INVALID_HWID); | ||
137 | |||
138 | /* | ||
139 | * Synchronise with the boot thread. | ||
140 | */ | ||
141 | raw_spin_lock(&boot_lock); | ||
142 | raw_spin_unlock(&boot_lock); | ||
143 | } | 107 | } |
144 | 108 | ||
145 | const struct cpu_operations smp_spin_table_ops = { | 109 | const struct cpu_operations smp_spin_table_ops = { |
@@ -147,5 +111,4 @@ const struct cpu_operations smp_spin_table_ops = { | |||
147 | .cpu_init = smp_spin_table_cpu_init, | 111 | .cpu_init = smp_spin_table_cpu_init, |
148 | .cpu_prepare = smp_spin_table_cpu_prepare, | 112 | .cpu_prepare = smp_spin_table_cpu_prepare, |
149 | .cpu_boot = smp_spin_table_cpu_boot, | 113 | .cpu_boot = smp_spin_table_cpu_boot, |
150 | .cpu_postboot = smp_spin_table_cpu_postboot, | ||
151 | }; | 114 | }; |
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 38f0558f0c0a..55437ba1f5a4 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c | |||
@@ -35,7 +35,7 @@ | |||
35 | * ldp x29, x30, [sp] | 35 | * ldp x29, x30, [sp] |
36 | * add sp, sp, #0x10 | 36 | * add sp, sp, #0x10 |
37 | */ | 37 | */ |
38 | int unwind_frame(struct stackframe *frame) | 38 | int notrace unwind_frame(struct stackframe *frame) |
39 | { | 39 | { |
40 | unsigned long high, low; | 40 | unsigned long high, low; |
41 | unsigned long fp = frame->fp; | 41 | unsigned long fp = frame->fp; |
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6815987b50f8..1a7125c3099b 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c | |||
@@ -18,6 +18,7 @@ | |||
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/clockchips.h> | ||
21 | #include <linux/export.h> | 22 | #include <linux/export.h> |
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
23 | #include <linux/interrupt.h> | 24 | #include <linux/interrupt.h> |
@@ -69,6 +70,8 @@ void __init time_init(void) | |||
69 | of_clk_init(NULL); | 70 | of_clk_init(NULL); |
70 | clocksource_of_init(); | 71 | clocksource_of_init(); |
71 | 72 | ||
73 | tick_setup_hrtimer_broadcast(); | ||
74 | |||
72 | arch_timer_rate = arch_timer_get_rate(); | 75 | arch_timer_rate = arch_timer_get_rate(); |
73 | if (!arch_timer_rate) | 76 | if (!arch_timer_rate) |
74 | panic("Unable to initialise architected timer.\n"); | 77 | panic("Unable to initialise architected timer.\n"); |
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3e06b0be4ec8..43514f905916 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c | |||
@@ -17,10 +17,192 @@ | |||
17 | #include <linux/percpu.h> | 17 | #include <linux/percpu.h> |
18 | #include <linux/node.h> | 18 | #include <linux/node.h> |
19 | #include <linux/nodemask.h> | 19 | #include <linux/nodemask.h> |
20 | #include <linux/of.h> | ||
20 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
21 | 22 | ||
22 | #include <asm/topology.h> | 23 | #include <asm/topology.h> |
23 | 24 | ||
25 | static int __init get_cpu_for_node(struct device_node *node) | ||
26 | { | ||
27 | struct device_node *cpu_node; | ||
28 | int cpu; | ||
29 | |||
30 | cpu_node = of_parse_phandle(node, "cpu", 0); | ||
31 | if (!cpu_node) | ||
32 | return -1; | ||
33 | |||
34 | for_each_possible_cpu(cpu) { | ||
35 | if (of_get_cpu_node(cpu, NULL) == cpu_node) { | ||
36 | of_node_put(cpu_node); | ||
37 | return cpu; | ||
38 | } | ||
39 | } | ||
40 | |||
41 | pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); | ||
42 | |||
43 | of_node_put(cpu_node); | ||
44 | return -1; | ||
45 | } | ||
46 | |||
47 | static int __init parse_core(struct device_node *core, int cluster_id, | ||
48 | int core_id) | ||
49 | { | ||
50 | char name[10]; | ||
51 | bool leaf = true; | ||
52 | int i = 0; | ||
53 | int cpu; | ||
54 | struct device_node *t; | ||
55 | |||
56 | do { | ||
57 | snprintf(name, sizeof(name), "thread%d", i); | ||
58 | t = of_get_child_by_name(core, name); | ||
59 | if (t) { | ||
60 | leaf = false; | ||
61 | cpu = get_cpu_for_node(t); | ||
62 | if (cpu >= 0) { | ||
63 | cpu_topology[cpu].cluster_id = cluster_id; | ||
64 | cpu_topology[cpu].core_id = core_id; | ||
65 | cpu_topology[cpu].thread_id = i; | ||
66 | } else { | ||
67 | pr_err("%s: Can't get CPU for thread\n", | ||
68 | t->full_name); | ||
69 | of_node_put(t); | ||
70 | return -EINVAL; | ||
71 | } | ||
72 | of_node_put(t); | ||
73 | } | ||
74 | i++; | ||
75 | } while (t); | ||
76 | |||
77 | cpu = get_cpu_for_node(core); | ||
78 | if (cpu >= 0) { | ||
79 | if (!leaf) { | ||
80 | pr_err("%s: Core has both threads and CPU\n", | ||
81 | core->full_name); | ||
82 | return -EINVAL; | ||
83 | } | ||
84 | |||
85 | cpu_topology[cpu].cluster_id = cluster_id; | ||
86 | cpu_topology[cpu].core_id = core_id; | ||
87 | } else if (leaf) { | ||
88 | pr_err("%s: Can't get CPU for leaf core\n", core->full_name); | ||
89 | return -EINVAL; | ||
90 | } | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | static int __init parse_cluster(struct device_node *cluster, int depth) | ||
96 | { | ||
97 | char name[10]; | ||
98 | bool leaf = true; | ||
99 | bool has_cores = false; | ||
100 | struct device_node *c; | ||
101 | static int cluster_id __initdata; | ||
102 | int core_id = 0; | ||
103 | int i, ret; | ||
104 | |||
105 | /* | ||
106 | * First check for child clusters; we currently ignore any | ||
107 | * information about the nesting of clusters and present the | ||
108 | * scheduler with a flat list of them. | ||
109 | */ | ||
110 | i = 0; | ||
111 | do { | ||
112 | snprintf(name, sizeof(name), "cluster%d", i); | ||
113 | c = of_get_child_by_name(cluster, name); | ||
114 | if (c) { | ||
115 | leaf = false; | ||
116 | ret = parse_cluster(c, depth + 1); | ||
117 | of_node_put(c); | ||
118 | if (ret != 0) | ||
119 | return ret; | ||
120 | } | ||
121 | i++; | ||
122 | } while (c); | ||
123 | |||
124 | /* Now check for cores */ | ||
125 | i = 0; | ||
126 | do { | ||
127 | snprintf(name, sizeof(name), "core%d", i); | ||
128 | c = of_get_child_by_name(cluster, name); | ||
129 | if (c) { | ||
130 | has_cores = true; | ||
131 | |||
132 | if (depth == 0) { | ||
133 | pr_err("%s: cpu-map children should be clusters\n", | ||
134 | c->full_name); | ||
135 | of_node_put(c); | ||
136 | return -EINVAL; | ||
137 | } | ||
138 | |||
139 | if (leaf) { | ||
140 | ret = parse_core(c, cluster_id, core_id++); | ||
141 | } else { | ||
142 | pr_err("%s: Non-leaf cluster with core %s\n", | ||
143 | cluster->full_name, name); | ||
144 | ret = -EINVAL; | ||
145 | } | ||
146 | |||
147 | of_node_put(c); | ||
148 | if (ret != 0) | ||
149 | return ret; | ||
150 | } | ||
151 | i++; | ||
152 | } while (c); | ||
153 | |||
154 | if (leaf && !has_cores) | ||
155 | pr_warn("%s: empty cluster\n", cluster->full_name); | ||
156 | |||
157 | if (leaf) | ||
158 | cluster_id++; | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | static int __init parse_dt_topology(void) | ||
164 | { | ||
165 | struct device_node *cn, *map; | ||
166 | int ret = 0; | ||
167 | int cpu; | ||
168 | |||
169 | cn = of_find_node_by_path("/cpus"); | ||
170 | if (!cn) { | ||
171 | pr_err("No CPU information found in DT\n"); | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | /* | ||
176 | * When topology is provided cpu-map is essentially a root | ||
177 | * cluster with restricted subnodes. | ||
178 | */ | ||
179 | map = of_get_child_by_name(cn, "cpu-map"); | ||
180 | if (!map) | ||
181 | goto out; | ||
182 | |||
183 | ret = parse_cluster(map, 0); | ||
184 | if (ret != 0) | ||
185 | goto out_map; | ||
186 | |||
187 | /* | ||
188 | * Check that all cores are in the topology; the SMP code will | ||
189 | * only mark cores described in the DT as possible. | ||
190 | */ | ||
191 | for_each_possible_cpu(cpu) { | ||
192 | if (cpu_topology[cpu].cluster_id == -1) { | ||
193 | pr_err("CPU%d: No topology information specified\n", | ||
194 | cpu); | ||
195 | ret = -EINVAL; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | out_map: | ||
200 | of_node_put(map); | ||
201 | out: | ||
202 | of_node_put(cn); | ||
203 | return ret; | ||
204 | } | ||
205 | |||
24 | /* | 206 | /* |
25 | * cpu topology table | 207 | * cpu topology table |
26 | */ | 208 | */ |
@@ -39,13 +221,9 @@ static void update_siblings_masks(unsigned int cpuid) | |||
39 | 221 | ||
40 | if (cpuid_topo->cluster_id == -1) { | 222 | if (cpuid_topo->cluster_id == -1) { |
41 | /* | 223 | /* |
42 | * DT does not contain topology information for this cpu | 224 | * DT does not contain topology information for this cpu. |
43 | * reset it to default behaviour | ||
44 | */ | 225 | */ |
45 | pr_debug("CPU%u: No topology information configured\n", cpuid); | 226 | pr_debug("CPU%u: No topology information configured\n", cpuid); |
46 | cpuid_topo->core_id = 0; | ||
47 | cpumask_set_cpu(cpuid, &cpuid_topo->core_sibling); | ||
48 | cpumask_set_cpu(cpuid, &cpuid_topo->thread_sibling); | ||
49 | return; | 227 | return; |
50 | } | 228 | } |
51 | 229 | ||
@@ -74,22 +252,32 @@ void store_cpu_topology(unsigned int cpuid) | |||
74 | update_siblings_masks(cpuid); | 252 | update_siblings_masks(cpuid); |
75 | } | 253 | } |
76 | 254 | ||
77 | /* | 255 | static void __init reset_cpu_topology(void) |
78 | * init_cpu_topology is called at boot when only one cpu is running | ||
79 | * which prevent simultaneous write access to cpu_topology array | ||
80 | */ | ||
81 | void __init init_cpu_topology(void) | ||
82 | { | 256 | { |
83 | unsigned int cpu; | 257 | unsigned int cpu; |
84 | 258 | ||
85 | /* init core mask and power*/ | ||
86 | for_each_possible_cpu(cpu) { | 259 | for_each_possible_cpu(cpu) { |
87 | struct cpu_topology *cpu_topo = &cpu_topology[cpu]; | 260 | struct cpu_topology *cpu_topo = &cpu_topology[cpu]; |
88 | 261 | ||
89 | cpu_topo->thread_id = -1; | 262 | cpu_topo->thread_id = -1; |
90 | cpu_topo->core_id = -1; | 263 | cpu_topo->core_id = 0; |
91 | cpu_topo->cluster_id = -1; | 264 | cpu_topo->cluster_id = -1; |
265 | |||
92 | cpumask_clear(&cpu_topo->core_sibling); | 266 | cpumask_clear(&cpu_topo->core_sibling); |
267 | cpumask_set_cpu(cpu, &cpu_topo->core_sibling); | ||
93 | cpumask_clear(&cpu_topo->thread_sibling); | 268 | cpumask_clear(&cpu_topo->thread_sibling); |
269 | cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); | ||
94 | } | 270 | } |
95 | } | 271 | } |
272 | |||
273 | void __init init_cpu_topology(void) | ||
274 | { | ||
275 | reset_cpu_topology(); | ||
276 | |||
277 | /* | ||
278 | * Discard anything that was parsed if we hit an error so we | ||
279 | * don't use partial information. | ||
280 | */ | ||
281 | if (parse_dt_topology()) | ||
282 | reset_cpu_topology(); | ||
283 | } | ||
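parse_dt_topology() walks the cpu-map node under /cpus, mapping clusterN/coreN/threadN subnodes onto cpu_topology[] (cluster_id, core_id, thread_id), and init_cpu_topology() discards the result again if any possible CPU was left undescribed. A hedged debugging sketch of how the parsed table could be inspected (the helper name is made up; cpu_topology[] and struct cpu_topology come from asm/topology.h):

    #include <linux/cpumask.h>
    #include <linux/init.h>
    #include <linux/printk.h>
    #include <asm/topology.h>

    static void __init dump_cpu_topology(void)
    {
            unsigned int cpu;

            for_each_possible_cpu(cpu) {
                    struct cpu_topology *t = &cpu_topology[cpu];

                    pr_info("CPU%u: cluster %d, core %d, thread %d\n",
                            cpu, t->cluster_id, t->core_id, t->thread_id);
            }
    }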
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 7ffadddb645d..c43cfa9b8304 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c | |||
@@ -251,10 +251,13 @@ void die(const char *str, struct pt_regs *regs, int err) | |||
251 | void arm64_notify_die(const char *str, struct pt_regs *regs, | 251 | void arm64_notify_die(const char *str, struct pt_regs *regs, |
252 | struct siginfo *info, int err) | 252 | struct siginfo *info, int err) |
253 | { | 253 | { |
254 | if (user_mode(regs)) | 254 | if (user_mode(regs)) { |
255 | current->thread.fault_address = 0; | ||
256 | current->thread.fault_code = err; | ||
255 | force_sig_info(info->si_signo, info, current); | 257 | force_sig_info(info->si_signo, info, current); |
256 | else | 258 | } else { |
257 | die(str, regs, err); | 259 | die(str, regs, err); |
260 | } | ||
258 | } | 261 | } |
259 | 262 | ||
260 | asmlinkage void __exception do_undefinstr(struct pt_regs *regs) | 263 | asmlinkage void __exception do_undefinstr(struct pt_regs *regs) |
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 4ba7a55b49c7..f1e6d5c032e1 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S | |||
@@ -13,7 +13,7 @@ | |||
13 | #define ARM_EXIT_DISCARD(x) x | 13 | #define ARM_EXIT_DISCARD(x) x |
14 | 14 | ||
15 | OUTPUT_ARCH(aarch64) | 15 | OUTPUT_ARCH(aarch64) |
16 | ENTRY(stext) | 16 | ENTRY(_text) |
17 | 17 | ||
18 | jiffies = jiffies_64; | 18 | jiffies = jiffies_64; |
19 | 19 | ||
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 2c56012cb2d2..b0d1512acf08 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S | |||
@@ -630,9 +630,15 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) | |||
630 | * whole of Stage-1. Weep... | 630 | * whole of Stage-1. Weep... |
631 | */ | 631 | */ |
632 | tlbi ipas2e1is, x1 | 632 | tlbi ipas2e1is, x1 |
633 | dsb sy | 633 | /* |
634 | * We have to ensure completion of the invalidation at Stage-2, | ||
635 | * since a table walk on another CPU could refill a TLB with a | ||
636 | * complete (S1 + S2) walk based on the old Stage-2 mapping if | ||
637 | * the Stage-1 invalidation happened first. | ||
638 | */ | ||
639 | dsb ish | ||
634 | tlbi vmalle1is | 640 | tlbi vmalle1is |
635 | dsb sy | 641 | dsb ish |
636 | isb | 642 | isb |
637 | 643 | ||
638 | msr vttbr_el2, xzr | 644 | msr vttbr_el2, xzr |
@@ -643,7 +649,7 @@ ENTRY(__kvm_flush_vm_context) | |||
643 | dsb ishst | 649 | dsb ishst |
644 | tlbi alle1is | 650 | tlbi alle1is |
645 | ic ialluis | 651 | ic ialluis |
646 | dsb sy | 652 | dsb ish |
647 | ret | 653 | ret |
648 | ENDPROC(__kvm_flush_vm_context) | 654 | ENDPROC(__kvm_flush_vm_context) |
649 | 655 | ||
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 03244582bc55..c59a1bdab5eb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c | |||
@@ -71,13 +71,13 @@ static u32 get_ccsidr(u32 csselr) | |||
71 | static void do_dc_cisw(u32 val) | 71 | static void do_dc_cisw(u32 val) |
72 | { | 72 | { |
73 | asm volatile("dc cisw, %x0" : : "r" (val)); | 73 | asm volatile("dc cisw, %x0" : : "r" (val)); |
74 | dsb(); | 74 | dsb(ish); |
75 | } | 75 | } |
76 | 76 | ||
77 | static void do_dc_csw(u32 val) | 77 | static void do_dc_csw(u32 val) |
78 | { | 78 | { |
79 | asm volatile("dc csw, %x0" : : "r" (val)); | 79 | asm volatile("dc csw, %x0" : : "r" (val)); |
80 | dsb(); | 80 | dsb(ish); |
81 | } | 81 | } |
82 | 82 | ||
83 | /* See note at ARM ARM B1.14.4 */ | 83 | /* See note at ARM ARM B1.14.4 */ |
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 328ce1a99daa..d98d3e39879e 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile | |||
@@ -1,4 +1,5 @@ | |||
1 | lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ | 1 | lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ |
2 | copy_to_user.o copy_in_user.o copy_page.o \ | 2 | copy_to_user.o copy_in_user.o copy_page.o \ |
3 | clear_page.o memchr.o memcpy.o memmove.o memset.o \ | 3 | clear_page.o memchr.o memcpy.o memmove.o memset.o \ |
4 | memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ | ||
4 | strchr.o strrchr.o | 5 | strchr.o strrchr.o |
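Adding the objects to lib-y is only half of the wiring; each routine also has to be advertised as an arch-optimised implementation so the generic C version in lib/string.c is compiled out. A hedged sketch of the shape of those declarations (the real ones live in asm/string.h and may differ):

    #define __HAVE_ARCH_MEMCMP
    extern int memcmp(const void *, const void *, size_t);

    #define __HAVE_ARCH_STRLEN
    extern __kernel_size_t strlen(const char *);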
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 000000000000..6ea0776ba6de --- /dev/null +++ b/arch/arm64/lib/memcmp.S | |||
@@ -0,0 +1,258 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * compare memory areas (when the two areas have different offsets, | ||
30 | * alignment is handled by the hardware) | ||
31 | * | ||
32 | * Parameters: | ||
33 | * x0 - const memory area 1 pointer | ||
34 | * x1 - const memory area 2 pointer | ||
35 | * x2 - the maximal compare byte length | ||
36 | * Returns: | ||
37 | * x0 - a compare result, maybe less than, equal to, or greater than ZERO | ||
38 | */ | ||
39 | |||
40 | /* Parameters and result. */ | ||
41 | src1 .req x0 | ||
42 | src2 .req x1 | ||
43 | limit .req x2 | ||
44 | result .req x0 | ||
45 | |||
46 | /* Internal variables. */ | ||
47 | data1 .req x3 | ||
48 | data1w .req w3 | ||
49 | data2 .req x4 | ||
50 | data2w .req w4 | ||
51 | has_nul .req x5 | ||
52 | diff .req x6 | ||
53 | endloop .req x7 | ||
54 | tmp1 .req x8 | ||
55 | tmp2 .req x9 | ||
56 | tmp3 .req x10 | ||
57 | pos .req x11 | ||
58 | limit_wd .req x12 | ||
59 | mask .req x13 | ||
60 | |||
61 | ENTRY(memcmp) | ||
62 | cbz limit, .Lret0 | ||
63 | eor tmp1, src1, src2 | ||
64 | tst tmp1, #7 | ||
65 | b.ne .Lmisaligned8 | ||
66 | ands tmp1, src1, #7 | ||
67 | b.ne .Lmutual_align | ||
68 | sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ | ||
69 | lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ | ||
70 | /* | ||
71 | * The input source addresses are at alignment boundary. | ||
72 | * Directly compare eight bytes each time. | ||
73 | */ | ||
74 | .Lloop_aligned: | ||
75 | ldr data1, [src1], #8 | ||
76 | ldr data2, [src2], #8 | ||
77 | .Lstart_realigned: | ||
78 | subs limit_wd, limit_wd, #1 | ||
79 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
80 | csinv endloop, diff, xzr, cs /* Last Dword or differences. */ | ||
81 | cbz endloop, .Lloop_aligned | ||
82 | |||
83 | /* Not reached the limit, must have found a diff. */ | ||
84 | tbz limit_wd, #63, .Lnot_limit | ||
85 | |||
86 | /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ | ||
87 | ands limit, limit, #7 | ||
88 | b.eq .Lnot_limit | ||
89 | /* | ||
90 | * Fewer than 8 bytes remain. Extract the valid data from the last | ||
91 | * eight bytes of the intended memory range. | ||
92 | */ | ||
93 | lsl limit, limit, #3 /* bytes-> bits. */ | ||
94 | mov mask, #~0 | ||
95 | CPU_BE( lsr mask, mask, limit ) | ||
96 | CPU_LE( lsl mask, mask, limit ) | ||
97 | bic data1, data1, mask | ||
98 | bic data2, data2, mask | ||
99 | |||
100 | orr diff, diff, mask | ||
101 | b .Lnot_limit | ||
102 | |||
103 | .Lmutual_align: | ||
104 | /* | ||
105 | * Sources are mutually aligned, but are not currently at an | ||
106 | * alignment boundary. Round down the addresses and then mask off | ||
107 | * the bytes that precede the start point. | ||
108 | */ | ||
109 | bic src1, src1, #7 | ||
110 | bic src2, src2, #7 | ||
111 | ldr data1, [src1], #8 | ||
112 | ldr data2, [src2], #8 | ||
113 | /* | ||
114 | * We cannot add the alignment offset (tmp1) to limit here, since | ||
115 | * the addition could overflow it. | ||
116 | */ | ||
117 | sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ | ||
118 | and tmp3, limit_wd, #7 | ||
119 | lsr limit_wd, limit_wd, #3 | ||
120 | add tmp3, tmp3, tmp1 | ||
121 | add limit_wd, limit_wd, tmp3, lsr #3 | ||
122 | add limit, limit, tmp1/* Adjust the limit for the extra. */ | ||
123 | |||
124 | lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ | ||
125 | neg tmp1, tmp1/* Bits to alignment -64. */ | ||
126 | mov tmp2, #~0 | ||
127 | /*mask off the non-intended bytes before the start address.*/ | ||
128 | CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ | ||
129 | /* Little-endian. Early bytes are at LSB. */ | ||
130 | CPU_LE( lsr tmp2, tmp2, tmp1 ) | ||
131 | |||
132 | orr data1, data1, tmp2 | ||
133 | orr data2, data2, tmp2 | ||
134 | b .Lstart_realigned | ||
135 | |||
136 | /*src1 and src2 have different alignment offset.*/ | ||
137 | .Lmisaligned8: | ||
138 | cmp limit, #8 | ||
139 | b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ | ||
140 | |||
141 | and tmp1, src1, #7 | ||
142 | neg tmp1, tmp1 | ||
143 | add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ | ||
144 | and tmp2, src2, #7 | ||
145 | neg tmp2, tmp2 | ||
146 | add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ | ||
147 | subs tmp3, tmp1, tmp2 | ||
148 | csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ | ||
149 | |||
150 | sub limit, limit, pos | ||
151 | /*compare the leading bytes in the first 8-byte segment.*/ | ||
152 | .Ltinycmp: | ||
153 | ldrb data1w, [src1], #1 | ||
154 | ldrb data2w, [src2], #1 | ||
155 | subs pos, pos, #1 | ||
156 | ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ | ||
157 | b.eq .Ltinycmp | ||
158 | cbnz pos, 1f /*diff occurred before the last byte.*/ | ||
159 | cmp data1w, data2w | ||
160 | b.eq .Lstart_align | ||
161 | 1: | ||
162 | sub result, data1, data2 | ||
163 | ret | ||
164 | |||
165 | .Lstart_align: | ||
166 | lsr limit_wd, limit, #3 | ||
167 | cbz limit_wd, .Lremain8 | ||
168 | |||
169 | ands xzr, src1, #7 | ||
170 | b.eq .Lrecal_offset | ||
171 | /*process more leading bytes to make src1 aligned...*/ | ||
172 | add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ | ||
173 | add src2, src2, tmp3 | ||
174 | sub limit, limit, tmp3 | ||
175 | lsr limit_wd, limit, #3 | ||
176 | cbz limit_wd, .Lremain8 | ||
177 | /*load 8 bytes from aligned SRC1..*/ | ||
178 | ldr data1, [src1], #8 | ||
179 | ldr data2, [src2], #8 | ||
180 | |||
181 | subs limit_wd, limit_wd, #1 | ||
182 | eor diff, data1, data2 /*Non-zero if differences found.*/ | ||
183 | csinv endloop, diff, xzr, ne | ||
184 | cbnz endloop, .Lunequal_proc | ||
185 | /*How far is the current SRC2 from the alignment boundary...*/ | ||
186 | and tmp3, tmp3, #7 | ||
187 | |||
188 | .Lrecal_offset:/*src1 is aligned now..*/ | ||
189 | neg pos, tmp3 | ||
190 | .Lloopcmp_proc: | ||
191 | /* | ||
192 | * Divide the eight bytes into two parts. First, move src2 back | ||
193 | * to an alignment boundary, load eight bytes and compare from | ||
194 | * the SRC2 alignment boundary. If all 8 bytes are equal, start | ||
195 | * the second part's comparison. Otherwise finish the comparison. | ||
196 | * This special handling guarantees that all accesses stay within | ||
197 | * the thread/task address space, avoiding out-of-range accesses. | ||
198 | */ | ||
199 | ldr data1, [src1,pos] | ||
200 | ldr data2, [src2,pos] | ||
201 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
202 | cbnz diff, .Lnot_limit | ||
203 | |||
204 | /*The second part of the comparison.*/ | ||
205 | ldr data1, [src1], #8 | ||
206 | ldr data2, [src2], #8 | ||
207 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
208 | subs limit_wd, limit_wd, #1 | ||
209 | csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ | ||
210 | cbz endloop, .Lloopcmp_proc | ||
211 | .Lunequal_proc: | ||
212 | cbz diff, .Lremain8 | ||
213 | |||
214 | /*A difference occurred in the latest comparison.*/ | ||
215 | .Lnot_limit: | ||
216 | /* | ||
217 | * For little-endian, reverse the least significant equal bits into the | ||
218 | * MSB so the following CLZ can count how many equal bits exist. | ||
219 | */ | ||
220 | CPU_LE( rev diff, diff ) | ||
221 | CPU_LE( rev data1, data1 ) | ||
222 | CPU_LE( rev data2, data2 ) | ||
223 | |||
224 | /* | ||
225 | * The MS-non-zero bit of DIFF marks either the first bit | ||
226 | * that is different, or the end of the significant data. | ||
227 | * Shifting left now will bring the critical information into the | ||
228 | * top bits. | ||
229 | */ | ||
230 | clz pos, diff | ||
231 | lsl data1, data1, pos | ||
232 | lsl data2, data2, pos | ||
233 | /* | ||
234 | * We need to zero-extend (char is unsigned) the value and then | ||
235 | * perform a signed subtraction. | ||
236 | */ | ||
237 | lsr data1, data1, #56 | ||
238 | sub result, data1, data2, lsr #56 | ||
239 | ret | ||
240 | |||
241 | .Lremain8: | ||
242 | /* Limit % 8 == 0 => all data are equal. */ | ||
243 | ands limit, limit, #7 | ||
244 | b.eq .Lret0 | ||
245 | |||
246 | .Ltiny8proc: | ||
247 | ldrb data1w, [src1], #1 | ||
248 | ldrb data2w, [src2], #1 | ||
249 | subs limit, limit, #1 | ||
250 | |||
251 | ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ | ||
252 | b.eq .Ltiny8proc | ||
253 | sub result, data1, data2 | ||
254 | ret | ||
255 | .Lret0: | ||
256 | mov result, #0 | ||
257 | ret | ||
258 | ENDPROC(memcmp) | ||
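For reference, the assembly above is an optimised equivalent of a byte-wise memcmp: the return value has the same sign as the difference between the first pair of bytes that differ (taken as unsigned chars), and is zero when the first 'limit' bytes match. A hedged C reference, useful only for cross-checking behaviour:

    #include <stddef.h>

    static int memcmp_ref(const void *s1, const void *s2, size_t n)
    {
            const unsigned char *p1 = s1, *p2 = s2;

            for (; n; n--, p1++, p2++) {
                    if (*p1 != *p2)
                            return *p1 - *p2;
            }
            return 0;
    }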
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 27b5003609b6..8a9a96d3ddae 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S | |||
@@ -1,5 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2013 ARM Ltd. | 2 | * Copyright (C) 2013 ARM Ltd. |
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
3 | * | 11 | * |
4 | * This program is free software; you can redistribute it and/or modify | 12 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 13 | * it under the terms of the GNU General Public License version 2 as |
@@ -16,6 +24,7 @@ | |||
16 | 24 | ||
17 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
18 | #include <asm/assembler.h> | 26 | #include <asm/assembler.h> |
27 | #include <asm/cache.h> | ||
19 | 28 | ||
20 | /* | 29 | /* |
21 | * Copy a buffer from src to dest (alignment handled by the hardware) | 30 | * Copy a buffer from src to dest (alignment handled by the hardware) |
@@ -27,27 +36,166 @@ | |||
27 | * Returns: | 36 | * Returns: |
28 | * x0 - dest | 37 | * x0 - dest |
29 | */ | 38 | */ |
39 | dstin .req x0 | ||
40 | src .req x1 | ||
41 | count .req x2 | ||
42 | tmp1 .req x3 | ||
43 | tmp1w .req w3 | ||
44 | tmp2 .req x4 | ||
45 | tmp2w .req w4 | ||
46 | tmp3 .req x5 | ||
47 | tmp3w .req w5 | ||
48 | dst .req x6 | ||
49 | |||
50 | A_l .req x7 | ||
51 | A_h .req x8 | ||
52 | B_l .req x9 | ||
53 | B_h .req x10 | ||
54 | C_l .req x11 | ||
55 | C_h .req x12 | ||
56 | D_l .req x13 | ||
57 | D_h .req x14 | ||
58 | |||
30 | ENTRY(memcpy) | 59 | ENTRY(memcpy) |
31 | mov x4, x0 | 60 | mov dst, dstin |
32 | subs x2, x2, #8 | 61 | cmp count, #16 |
33 | b.mi 2f | 62 | /*When the length is less than 16, the accesses may be unaligned.*/ |
34 | 1: ldr x3, [x1], #8 | 63 | b.lo .Ltiny15 |
35 | subs x2, x2, #8 | 64 | |
36 | str x3, [x4], #8 | 65 | neg tmp2, src |
37 | b.pl 1b | 66 | ands tmp2, tmp2, #15/* Bytes to reach alignment. */ |
38 | 2: adds x2, x2, #4 | 67 | b.eq .LSrcAligned |
39 | b.mi 3f | 68 | sub count, count, tmp2 |
40 | ldr w3, [x1], #4 | 69 | /* |
41 | sub x2, x2, #4 | 70 | * Copy the leading bytes from src to dst in increasing address |
42 | str w3, [x4], #4 | 71 | * order. This eliminates the risk of overwriting the source data |
43 | 3: adds x2, x2, #2 | 72 | * when the distance between src and dst is less than 16. The |
44 | b.mi 4f | 73 | * memory accesses here are aligned. |
45 | ldrh w3, [x1], #2 | 74 | */ |
46 | sub x2, x2, #2 | 75 | tbz tmp2, #0, 1f |
47 | strh w3, [x4], #2 | 76 | ldrb tmp1w, [src], #1 |
48 | 4: adds x2, x2, #1 | 77 | strb tmp1w, [dst], #1 |
49 | b.mi 5f | 78 | 1: |
50 | ldrb w3, [x1] | 79 | tbz tmp2, #1, 2f |
51 | strb w3, [x4] | 80 | ldrh tmp1w, [src], #2 |
52 | 5: ret | 81 | strh tmp1w, [dst], #2 |
82 | 2: | ||
83 | tbz tmp2, #2, 3f | ||
84 | ldr tmp1w, [src], #4 | ||
85 | str tmp1w, [dst], #4 | ||
86 | 3: | ||
87 | tbz tmp2, #3, .LSrcAligned | ||
88 | ldr tmp1, [src],#8 | ||
89 | str tmp1, [dst],#8 | ||
90 | |||
91 | .LSrcAligned: | ||
92 | cmp count, #64 | ||
93 | b.ge .Lcpy_over64 | ||
94 | /* | ||
95 | * Deal with small copies quickly by dropping straight into the | ||
96 | * exit block. | ||
97 | */ | ||
98 | .Ltail63: | ||
99 | /* | ||
100 | * Copy up to 48 bytes of data. At this point we only need the | ||
101 | * bottom 6 bits of count to be accurate. | ||
102 | */ | ||
103 | ands tmp1, count, #0x30 | ||
104 | b.eq .Ltiny15 | ||
105 | cmp tmp1w, #0x20 | ||
106 | b.eq 1f | ||
107 | b.lt 2f | ||
108 | ldp A_l, A_h, [src], #16 | ||
109 | stp A_l, A_h, [dst], #16 | ||
110 | 1: | ||
111 | ldp A_l, A_h, [src], #16 | ||
112 | stp A_l, A_h, [dst], #16 | ||
113 | 2: | ||
114 | ldp A_l, A_h, [src], #16 | ||
115 | stp A_l, A_h, [dst], #16 | ||
116 | .Ltiny15: | ||
117 | /* | ||
118 | * Prefer to break one ldp/stp into several loads/stores that access | ||
119 | * memory in increasing address order, rather than loading/storing 16 | ||
120 | * bytes from (src-16) to (dst-16) and moving src back to an aligned | ||
121 | * address, as the original cortex memcpy does. If the original memcpy | ||
122 | * approach were kept here, memmove would have to satisfy the | ||
123 | * precondition that the src address is at least 16 bytes above the dst | ||
124 | * address, otherwise some source data would be overwritten when memmove | ||
125 | * calls memcpy directly. To keep memmove simple and decouple memcpy | ||
126 | * from memmove, the original approach was dropped. | ||
127 | */ | ||
128 | tbz count, #3, 1f | ||
129 | ldr tmp1, [src], #8 | ||
130 | str tmp1, [dst], #8 | ||
131 | 1: | ||
132 | tbz count, #2, 2f | ||
133 | ldr tmp1w, [src], #4 | ||
134 | str tmp1w, [dst], #4 | ||
135 | 2: | ||
136 | tbz count, #1, 3f | ||
137 | ldrh tmp1w, [src], #2 | ||
138 | strh tmp1w, [dst], #2 | ||
139 | 3: | ||
140 | tbz count, #0, .Lexitfunc | ||
141 | ldrb tmp1w, [src] | ||
142 | strb tmp1w, [dst] | ||
143 | |||
144 | .Lexitfunc: | ||
145 | ret | ||
146 | |||
147 | .Lcpy_over64: | ||
148 | subs count, count, #128 | ||
149 | b.ge .Lcpy_body_large | ||
150 | /* | ||
151 | * Less than 128 bytes to copy, so handle 64 here and then jump | ||
152 | * to the tail. | ||
153 | */ | ||
154 | ldp A_l, A_h, [src],#16 | ||
155 | stp A_l, A_h, [dst],#16 | ||
156 | ldp B_l, B_h, [src],#16 | ||
157 | ldp C_l, C_h, [src],#16 | ||
158 | stp B_l, B_h, [dst],#16 | ||
159 | stp C_l, C_h, [dst],#16 | ||
160 | ldp D_l, D_h, [src],#16 | ||
161 | stp D_l, D_h, [dst],#16 | ||
162 | |||
163 | tst count, #0x3f | ||
164 | b.ne .Ltail63 | ||
165 | ret | ||
166 | |||
167 | /* | ||
168 | * Critical loop. Start at a new cache line boundary. Assuming | ||
169 | * 64 bytes per line this ensures the entire loop is in one line. | ||
170 | */ | ||
171 | .p2align L1_CACHE_SHIFT | ||
172 | .Lcpy_body_large: | ||
173 | /* pre-load 64 bytes of data. */ | ||
174 | ldp A_l, A_h, [src],#16 | ||
175 | ldp B_l, B_h, [src],#16 | ||
176 | ldp C_l, C_h, [src],#16 | ||
177 | ldp D_l, D_h, [src],#16 | ||
178 | 1: | ||
179 | /* | ||
180 | * Interleave the load of the next 64-byte block with the store of | ||
181 | * the last 64 bytes loaded. | ||
182 | */ | ||
183 | stp A_l, A_h, [dst],#16 | ||
184 | ldp A_l, A_h, [src],#16 | ||
185 | stp B_l, B_h, [dst],#16 | ||
186 | ldp B_l, B_h, [src],#16 | ||
187 | stp C_l, C_h, [dst],#16 | ||
188 | ldp C_l, C_h, [src],#16 | ||
189 | stp D_l, D_h, [dst],#16 | ||
190 | ldp D_l, D_h, [src],#16 | ||
191 | subs count, count, #64 | ||
192 | b.ge 1b | ||
193 | stp A_l, A_h, [dst],#16 | ||
194 | stp B_l, B_h, [dst],#16 | ||
195 | stp C_l, C_h, [dst],#16 | ||
196 | stp D_l, D_h, [dst],#16 | ||
197 | |||
198 | tst count, #0x3f | ||
199 | b.ne .Ltail63 | ||
200 | ret | ||
53 | ENDPROC(memcpy) | 201 | ENDPROC(memcpy) |
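The structure of the new memcpy is: consume the misaligned head one power-of-two chunk at a time (the tbz tests on the individual bits of the misalignment), run the bulk of the copy as an unrolled 64-byte ldp/stp loop placed on a cache-line boundary, then dispatch the tail the same bit-by-bit way. A hedged C sketch of that strategy, not the kernel implementation (the small fixed-size memcpy() calls stand in for the ldr/str and ldp/stp pairs, and the tail is simplified to a byte loop):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void *memcpy_sketch(void *dest, const void *src, size_t count)
    {
            unsigned char *d = dest;
            const unsigned char *s = src;

            if (count >= 16) {
                    size_t head = (-(uintptr_t)s) & 15;  /* bytes to align src to 16 */

                    if (head & 1) { *d++ = *s++; }
                    if (head & 2) { memcpy(d, s, 2); d += 2; s += 2; }
                    if (head & 4) { memcpy(d, s, 4); d += 4; s += 4; }
                    if (head & 8) { memcpy(d, s, 8); d += 8; s += 8; }
                    count -= head;

                    while (count >= 64) {                /* the unrolled ldp/stp loop */
                            memcpy(d, s, 64);
                            d += 64; s += 64; count -= 64;
                    }
            }
            while (count--)                              /* tail, byte-wise for brevity */
                    *d++ = *s++;
            return dest;
    }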
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S index b79fdfa42d39..57b19ea2dad4 100644 --- a/arch/arm64/lib/memmove.S +++ b/arch/arm64/lib/memmove.S | |||
@@ -1,5 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2013 ARM Ltd. | 2 | * Copyright (C) 2013 ARM Ltd. |
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
3 | * | 11 | * |
4 | * This program is free software; you can redistribute it and/or modify | 12 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 13 | * it under the terms of the GNU General Public License version 2 as |
@@ -16,6 +24,7 @@ | |||
16 | 24 | ||
17 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
18 | #include <asm/assembler.h> | 26 | #include <asm/assembler.h> |
27 | #include <asm/cache.h> | ||
19 | 28 | ||
20 | /* | 29 | /* |
21 | * Move a buffer from src to test (alignment handled by the hardware). | 30 | * Move a buffer from src to test (alignment handled by the hardware). |
@@ -28,30 +37,161 @@ | |||
28 | * Returns: | 37 | * Returns: |
29 | * x0 - dest | 38 | * x0 - dest |
30 | */ | 39 | */ |
40 | dstin .req x0 | ||
41 | src .req x1 | ||
42 | count .req x2 | ||
43 | tmp1 .req x3 | ||
44 | tmp1w .req w3 | ||
45 | tmp2 .req x4 | ||
46 | tmp2w .req w4 | ||
47 | tmp3 .req x5 | ||
48 | tmp3w .req w5 | ||
49 | dst .req x6 | ||
50 | |||
51 | A_l .req x7 | ||
52 | A_h .req x8 | ||
53 | B_l .req x9 | ||
54 | B_h .req x10 | ||
55 | C_l .req x11 | ||
56 | C_h .req x12 | ||
57 | D_l .req x13 | ||
58 | D_h .req x14 | ||
59 | |||
31 | ENTRY(memmove) | 60 | ENTRY(memmove) |
32 | cmp x0, x1 | 61 | cmp dstin, src |
33 | b.ls memcpy | 62 | b.lo memcpy |
34 | add x4, x0, x2 | 63 | add tmp1, src, count |
35 | add x1, x1, x2 | 64 | cmp dstin, tmp1 |
36 | subs x2, x2, #8 | 65 | b.hs memcpy /* No overlap. */ |
37 | b.mi 2f | 66 | |
38 | 1: ldr x3, [x1, #-8]! | 67 | add dst, dstin, count |
39 | subs x2, x2, #8 | 68 | add src, src, count |
40 | str x3, [x4, #-8]! | 69 | cmp count, #16 |
41 | b.pl 1b | 70 | b.lo .Ltail15 /*probably non-alignment accesses.*/ |
42 | 2: adds x2, x2, #4 | 71 | |
43 | b.mi 3f | 72 | ands tmp2, src, #15 /* Bytes to reach alignment. */ |
44 | ldr w3, [x1, #-4]! | 73 | b.eq .LSrcAligned |
45 | sub x2, x2, #4 | 74 | sub count, count, tmp2 |
46 | str w3, [x4, #-4]! | 75 | /* |
47 | 3: adds x2, x2, #2 | 76 | * process the misaligned head first so that src becomes aligned. |
48 | b.mi 4f | 77 | * The cost of these extra instructions is acceptable, and the |
49 | ldrh w3, [x1, #-2]! | 78 | * subsequent accesses are then based on aligned addresses. |
50 | sub x2, x2, #2 | 79 | */ |
51 | strh w3, [x4, #-2]! | 80 | tbz tmp2, #0, 1f |
52 | 4: adds x2, x2, #1 | 81 | ldrb tmp1w, [src, #-1]! |
53 | b.mi 5f | 82 | strb tmp1w, [dst, #-1]! |
54 | ldrb w3, [x1, #-1] | 83 | 1: |
55 | strb w3, [x4, #-1] | 84 | tbz tmp2, #1, 2f |
56 | 5: ret | 85 | ldrh tmp1w, [src, #-2]! |
86 | strh tmp1w, [dst, #-2]! | ||
87 | 2: | ||
88 | tbz tmp2, #2, 3f | ||
89 | ldr tmp1w, [src, #-4]! | ||
90 | str tmp1w, [dst, #-4]! | ||
91 | 3: | ||
92 | tbz tmp2, #3, .LSrcAligned | ||
93 | ldr tmp1, [src, #-8]! | ||
94 | str tmp1, [dst, #-8]! | ||
95 | |||
96 | .LSrcAligned: | ||
97 | cmp count, #64 | ||
98 | b.ge .Lcpy_over64 | ||
99 | |||
100 | /* | ||
101 | * Deal with small copies quickly by dropping straight into the | ||
102 | * exit block. | ||
103 | */ | ||
104 | .Ltail63: | ||
105 | /* | ||
106 | * Copy up to 48 bytes of data. At this point we only need the | ||
107 | * bottom 6 bits of count to be accurate. | ||
108 | */ | ||
109 | ands tmp1, count, #0x30 | ||
110 | b.eq .Ltail15 | ||
111 | cmp tmp1w, #0x20 | ||
112 | b.eq 1f | ||
113 | b.lt 2f | ||
114 | ldp A_l, A_h, [src, #-16]! | ||
115 | stp A_l, A_h, [dst, #-16]! | ||
116 | 1: | ||
117 | ldp A_l, A_h, [src, #-16]! | ||
118 | stp A_l, A_h, [dst, #-16]! | ||
119 | 2: | ||
120 | ldp A_l, A_h, [src, #-16]! | ||
121 | stp A_l, A_h, [dst, #-16]! | ||
122 | |||
123 | .Ltail15: | ||
124 | tbz count, #3, 1f | ||
125 | ldr tmp1, [src, #-8]! | ||
126 | str tmp1, [dst, #-8]! | ||
127 | 1: | ||
128 | tbz count, #2, 2f | ||
129 | ldr tmp1w, [src, #-4]! | ||
130 | str tmp1w, [dst, #-4]! | ||
131 | 2: | ||
132 | tbz count, #1, 3f | ||
133 | ldrh tmp1w, [src, #-2]! | ||
134 | strh tmp1w, [dst, #-2]! | ||
135 | 3: | ||
136 | tbz count, #0, .Lexitfunc | ||
137 | ldrb tmp1w, [src, #-1] | ||
138 | strb tmp1w, [dst, #-1] | ||
139 | |||
140 | .Lexitfunc: | ||
141 | ret | ||
142 | |||
143 | .Lcpy_over64: | ||
144 | subs count, count, #128 | ||
145 | b.ge .Lcpy_body_large | ||
146 | /* | ||
147 | * Less than 128 bytes to copy, so handle 64 bytes here and then jump | ||
148 | * to the tail. | ||
149 | */ | ||
150 | ldp A_l, A_h, [src, #-16] | ||
151 | stp A_l, A_h, [dst, #-16] | ||
152 | ldp B_l, B_h, [src, #-32] | ||
153 | ldp C_l, C_h, [src, #-48] | ||
154 | stp B_l, B_h, [dst, #-32] | ||
155 | stp C_l, C_h, [dst, #-48] | ||
156 | ldp D_l, D_h, [src, #-64]! | ||
157 | stp D_l, D_h, [dst, #-64]! | ||
158 | |||
159 | tst count, #0x3f | ||
160 | b.ne .Ltail63 | ||
161 | ret | ||
162 | |||
163 | /* | ||
164 | * Critical loop. Start at a new cache line boundary. Assuming | ||
165 | * 64 bytes per line, this ensures the entire loop is in one line. | ||
166 | */ | ||
167 | .p2align L1_CACHE_SHIFT | ||
168 | .Lcpy_body_large: | ||
169 | /* pre-load 64 bytes data. */ | ||
170 | ldp A_l, A_h, [src, #-16] | ||
171 | ldp B_l, B_h, [src, #-32] | ||
172 | ldp C_l, C_h, [src, #-48] | ||
173 | ldp D_l, D_h, [src, #-64]! | ||
174 | 1: | ||
175 | /* | ||
176 | * Interleave the load of the next 64-byte block with the store of the | ||
177 | * previously loaded 64 bytes of data. | ||
178 | */ | ||
179 | stp A_l, A_h, [dst, #-16] | ||
180 | ldp A_l, A_h, [src, #-16] | ||
181 | stp B_l, B_h, [dst, #-32] | ||
182 | ldp B_l, B_h, [src, #-32] | ||
183 | stp C_l, C_h, [dst, #-48] | ||
184 | ldp C_l, C_h, [src, #-48] | ||
185 | stp D_l, D_h, [dst, #-64]! | ||
186 | ldp D_l, D_h, [src, #-64]! | ||
187 | subs count, count, #64 | ||
188 | b.ge 1b | ||
189 | stp A_l, A_h, [dst, #-16] | ||
190 | stp B_l, B_h, [dst, #-32] | ||
191 | stp C_l, C_h, [dst, #-48] | ||
192 | stp D_l, D_h, [dst, #-64]! | ||
193 | |||
194 | tst count, #0x3f | ||
195 | b.ne .Ltail63 | ||
196 | ret | ||
57 | ENDPROC(memmove) | 197 | ENDPROC(memmove) |
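The rewritten memmove only takes the backward-copy path above when the buffers genuinely overlap with the destination above the source; every other case falls through to memcpy. A rough C sketch of that dispatch decision, with byte-wise loops standing in for the optimised assembly paths (function names here are illustrative, not kernel interfaces):

#include <stddef.h>
#include <stdint.h>

/* Byte-wise stand-ins for the optimised forward/backward copy paths. */
static void copy_forward(unsigned char *d, const unsigned char *s, size_t n)
{
	while (n--)
		*d++ = *s++;
}

static void copy_backward(unsigned char *d, const unsigned char *s, size_t n)
{
	while (n--)
		d[n] = s[n];	/* copy from the highest address downwards */
}

void *memmove_sketch(void *dstin, const void *src, size_t count)
{
	unsigned char *d = dstin;
	const unsigned char *s = src;

	/* dst below src, or dst at/after src + count: no destructive
	 * overlap, so a plain forward copy (memcpy) is safe. */
	if ((uintptr_t)d < (uintptr_t)s ||
	    (uintptr_t)d >= (uintptr_t)s + count)
		copy_forward(d, s, count);
	else
		copy_backward(d, s, count);
	return dstin;
}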
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index 87e4a68fbbbc..7c72dfd36b63 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S | |||
@@ -1,5 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2013 ARM Ltd. | 2 | * Copyright (C) 2013 ARM Ltd. |
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
3 | * | 11 | * |
4 | * This program is free software; you can redistribute it and/or modify | 12 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 13 | * it under the terms of the GNU General Public License version 2 as |
@@ -16,6 +24,7 @@ | |||
16 | 24 | ||
17 | #include <linux/linkage.h> | 25 | #include <linux/linkage.h> |
18 | #include <asm/assembler.h> | 26 | #include <asm/assembler.h> |
27 | #include <asm/cache.h> | ||
19 | 28 | ||
20 | /* | 29 | /* |
21 | * Fill in the buffer with character c (alignment handled by the hardware) | 30 | * Fill in the buffer with character c (alignment handled by the hardware) |
@@ -27,27 +36,181 @@ | |||
27 | * Returns: | 36 | * Returns: |
28 | * x0 - buf | 37 | * x0 - buf |
29 | */ | 38 | */ |
39 | |||
40 | dstin .req x0 | ||
41 | val .req w1 | ||
42 | count .req x2 | ||
43 | tmp1 .req x3 | ||
44 | tmp1w .req w3 | ||
45 | tmp2 .req x4 | ||
46 | tmp2w .req w4 | ||
47 | zva_len_x .req x5 | ||
48 | zva_len .req w5 | ||
49 | zva_bits_x .req x6 | ||
50 | |||
51 | A_l .req x7 | ||
52 | A_lw .req w7 | ||
53 | dst .req x8 | ||
54 | tmp3w .req w9 | ||
55 | tmp3 .req x9 | ||
56 | |||
30 | ENTRY(memset) | 57 | ENTRY(memset) |
31 | mov x4, x0 | 58 | mov dst, dstin /* Preserve return value. */ |
32 | and w1, w1, #0xff | 59 | and A_lw, val, #255 |
33 | orr w1, w1, w1, lsl #8 | 60 | orr A_lw, A_lw, A_lw, lsl #8 |
34 | orr w1, w1, w1, lsl #16 | 61 | orr A_lw, A_lw, A_lw, lsl #16 |
35 | orr x1, x1, x1, lsl #32 | 62 | orr A_l, A_l, A_l, lsl #32 |
36 | subs x2, x2, #8 | 63 | |
37 | b.mi 2f | 64 | cmp count, #15 |
38 | 1: str x1, [x4], #8 | 65 | b.hi .Lover16_proc |
39 | subs x2, x2, #8 | 66 | /* All the stores below may be unaligned. */ |
40 | b.pl 1b | 67 | tbz count, #3, 1f |
41 | 2: adds x2, x2, #4 | 68 | str A_l, [dst], #8 |
42 | b.mi 3f | 69 | 1: |
43 | sub x2, x2, #4 | 70 | tbz count, #2, 2f |
44 | str w1, [x4], #4 | 71 | str A_lw, [dst], #4 |
45 | 3: adds x2, x2, #2 | 72 | 2: |
46 | b.mi 4f | 73 | tbz count, #1, 3f |
47 | sub x2, x2, #2 | 74 | strh A_lw, [dst], #2 |
48 | strh w1, [x4], #2 | 75 | 3: |
49 | 4: adds x2, x2, #1 | 76 | tbz count, #0, 4f |
50 | b.mi 5f | 77 | strb A_lw, [dst] |
51 | strb w1, [x4] | 78 | 4: |
52 | 5: ret | 79 | ret |
80 | |||
81 | .Lover16_proc: | ||
82 | /* Check whether the start address is 16-byte aligned. */ | ||
83 | neg tmp2, dst | ||
84 | ands tmp2, tmp2, #15 | ||
85 | b.eq .Laligned | ||
86 | /* | ||
87 | * The count is not less than 16, we can use stp to store the start 16 bytes, | ||
88 | * then adjust the dst aligned with 16.This process will make the current | ||
89 | * memory address at alignment boundary. | ||
90 | */ | ||
91 | stp A_l, A_l, [dst] /* possibly unaligned store */ | ||
92 | /* make dst 16-byte aligned */ | ||
93 | sub count, count, tmp2 | ||
94 | add dst, dst, tmp2 | ||
95 | |||
96 | .Laligned: | ||
97 | cbz A_l, .Lzero_mem | ||
98 | |||
99 | .Ltail_maybe_long: | ||
100 | cmp count, #64 | ||
101 | b.ge .Lnot_short | ||
102 | .Ltail63: | ||
103 | ands tmp1, count, #0x30 | ||
104 | b.eq 3f | ||
105 | cmp tmp1w, #0x20 | ||
106 | b.eq 1f | ||
107 | b.lt 2f | ||
108 | stp A_l, A_l, [dst], #16 | ||
109 | 1: | ||
110 | stp A_l, A_l, [dst], #16 | ||
111 | 2: | ||
112 | stp A_l, A_l, [dst], #16 | ||
113 | /* | ||
114 | * The remaining length is less than 16; use stp to write the last 16 bytes. | ||
115 | * This writes some bytes twice and the access may be unaligned. | ||
116 | */ | ||
117 | 3: | ||
118 | ands count, count, #15 | ||
119 | cbz count, 4f | ||
120 | add dst, dst, count | ||
121 | stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ | ||
122 | 4: | ||
123 | ret | ||
124 | |||
125 | /* | ||
126 | * Critical loop. Start at a new cache line boundary. Assuming | ||
127 | * 64 bytes per line, this ensures the entire loop is in one line. | ||
128 | */ | ||
129 | .p2align L1_CACHE_SHIFT | ||
130 | .Lnot_short: | ||
131 | sub dst, dst, #16/* Pre-bias. */ | ||
132 | sub count, count, #64 | ||
133 | 1: | ||
134 | stp A_l, A_l, [dst, #16] | ||
135 | stp A_l, A_l, [dst, #32] | ||
136 | stp A_l, A_l, [dst, #48] | ||
137 | stp A_l, A_l, [dst, #64]! | ||
138 | subs count, count, #64 | ||
139 | b.ge 1b | ||
140 | tst count, #0x3f | ||
141 | add dst, dst, #16 | ||
142 | b.ne .Ltail63 | ||
143 | .Lexitfunc: | ||
144 | ret | ||
145 | |||
146 | /* | ||
147 | * For zeroing memory, check to see if we can use the ZVA feature to | ||
148 | * zero entire 'cache' lines. | ||
149 | */ | ||
150 | .Lzero_mem: | ||
151 | cmp count, #63 | ||
152 | b.le .Ltail63 | ||
153 | /* | ||
154 | * For zeroing small amounts of memory, it's not worth setting up | ||
155 | * the line-clear code. | ||
156 | */ | ||
157 | cmp count, #128 | ||
158 | b.lt .Lnot_short /*count is at least 128 bytes*/ | ||
159 | |||
160 | mrs tmp1, dczid_el0 | ||
161 | tbnz tmp1, #4, .Lnot_short | ||
162 | mov tmp3w, #4 | ||
163 | and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ | ||
164 | lsl zva_len, tmp3w, zva_len | ||
165 | |||
166 | ands tmp3w, zva_len, #63 | ||
167 | /* | ||
168 | * Ensure zva_len is at least 64: using DC ZVA is not worthwhile | ||
169 | * if the block size is smaller than 64 bytes. | ||
170 | */ | ||
171 | b.ne .Lnot_short | ||
172 | .Lzero_by_line: | ||
173 | /* | ||
174 | * Compute how far we need to go to become suitably aligned. We're | ||
175 | * already at quad-word alignment. | ||
176 | */ | ||
177 | cmp count, zva_len_x | ||
178 | b.lt .Lnot_short /* Not enough to reach alignment. */ | ||
179 | sub zva_bits_x, zva_len_x, #1 | ||
180 | neg tmp2, dst | ||
181 | ands tmp2, tmp2, zva_bits_x | ||
182 | b.eq 2f /* Already aligned. */ | ||
183 | /* Not aligned, check that there's enough to copy after alignment.*/ | ||
184 | sub tmp1, count, tmp2 | ||
185 | /* | ||
186 | * Guarantee that the length remaining for ZVA is at least 64 bytes, | ||
187 | * so that the processing at 2f cannot run past the end of the buffer. */ | ||
188 | cmp tmp1, #64 | ||
189 | ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ | ||
190 | b.lt .Lnot_short | ||
191 | /* | ||
192 | * We know that there's at least 64 bytes to zero and that it's safe | ||
193 | * to overrun by 64 bytes. | ||
194 | */ | ||
195 | mov count, tmp1 | ||
196 | 1: | ||
197 | stp A_l, A_l, [dst] | ||
198 | stp A_l, A_l, [dst, #16] | ||
199 | stp A_l, A_l, [dst, #32] | ||
200 | subs tmp2, tmp2, #64 | ||
201 | stp A_l, A_l, [dst, #48] | ||
202 | add dst, dst, #64 | ||
203 | b.ge 1b | ||
204 | /* We've overrun a bit, so adjust dst downwards.*/ | ||
205 | add dst, dst, tmp2 | ||
206 | 2: | ||
207 | sub count, count, zva_len_x | ||
208 | 3: | ||
209 | dc zva, dst | ||
210 | add dst, dst, zva_len_x | ||
211 | subs count, count, zva_len_x | ||
212 | b.ge 3b | ||
213 | ands count, count, zva_bits_x | ||
214 | b.ne .Ltail_maybe_long | ||
215 | ret | ||
53 | ENDPROC(memset) | 216 | ENDPROC(memset) |
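Two details of the new memset are easier to see in C than in the assembly above: how the fill byte is replicated into a 64-bit pattern, and how DCZID_EL0 is decoded before the DC ZVA zeroing path is used. A small hedged sketch (names are illustrative; DCZID_EL0 is really read with mrs, as in the code above):

#include <stdint.h>

/* Replicate the fill byte across a 64-bit value, mirroring the orr/lsl
 * sequence at the top of memset. */
uint64_t replicate_byte(uint8_t c)
{
	uint64_t v = c;

	v |= v << 8;
	v |= v << 16;
	v |= v << 32;
	return v;			/* e.g. 0xabab...ab for c == 0xab */
}

/* Decode DCZID_EL0 as .Lzero_mem does: bit 4 (DZP) prohibits DC ZVA, and
 * bits [3:0] give log2 of the block size in 4-byte words, so the block size
 * in bytes is 4 << BS.  The zeroing path is only used when this is at least
 * 64 bytes. */
unsigned int zva_block_bytes(uint64_t dczid)
{
	if (dczid & (1u << 4))
		return 0;		/* DC ZVA not usable */
	return 4u << (dczid & 0xf);
}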
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 000000000000..42f828b06c59 --- /dev/null +++ b/arch/arm64/lib/strcmp.S | |||
@@ -0,0 +1,234 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * compare two strings | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string 1 pointer | ||
33 | * x1 - const string 2 pointer | ||
34 | * Returns: | ||
35 | * x0 - an integer less than, equal to, or greater than zero | ||
36 | * if s1 is found, respectively, to be less than, to match, | ||
37 | * or be greater than s2. | ||
38 | */ | ||
39 | |||
40 | #define REP8_01 0x0101010101010101 | ||
41 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
42 | #define REP8_80 0x8080808080808080 | ||
43 | |||
44 | /* Parameters and result. */ | ||
45 | src1 .req x0 | ||
46 | src2 .req x1 | ||
47 | result .req x0 | ||
48 | |||
49 | /* Internal variables. */ | ||
50 | data1 .req x2 | ||
51 | data1w .req w2 | ||
52 | data2 .req x3 | ||
53 | data2w .req w3 | ||
54 | has_nul .req x4 | ||
55 | diff .req x5 | ||
56 | syndrome .req x6 | ||
57 | tmp1 .req x7 | ||
58 | tmp2 .req x8 | ||
59 | tmp3 .req x9 | ||
60 | zeroones .req x10 | ||
61 | pos .req x11 | ||
62 | |||
63 | ENTRY(strcmp) | ||
64 | eor tmp1, src1, src2 | ||
65 | mov zeroones, #REP8_01 | ||
66 | tst tmp1, #7 | ||
67 | b.ne .Lmisaligned8 | ||
68 | ands tmp1, src1, #7 | ||
69 | b.ne .Lmutual_align | ||
70 | |||
71 | /* | ||
72 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
73 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
74 | * can be done in parallel across the entire word. | ||
75 | */ | ||
76 | .Lloop_aligned: | ||
77 | ldr data1, [src1], #8 | ||
78 | ldr data2, [src2], #8 | ||
79 | .Lstart_realigned: | ||
80 | sub tmp1, data1, zeroones | ||
81 | orr tmp2, data1, #REP8_7f | ||
82 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
83 | bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ | ||
84 | orr syndrome, diff, has_nul | ||
85 | cbz syndrome, .Lloop_aligned | ||
86 | b .Lcal_cmpresult | ||
87 | |||
88 | .Lmutual_align: | ||
89 | /* | ||
90 | * Sources are mutually aligned, but are not currently at an | ||
91 | * alignment boundary. Round down the addresses and then mask off | ||
92 | * the bytes that precede the start point. | ||
93 | */ | ||
94 | bic src1, src1, #7 | ||
95 | bic src2, src2, #7 | ||
96 | lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ | ||
97 | ldr data1, [src1], #8 | ||
98 | neg tmp1, tmp1 /* Bits to alignment -64. */ | ||
99 | ldr data2, [src2], #8 | ||
100 | mov tmp2, #~0 | ||
101 | /* Big-endian. Early bytes are at MSB. */ | ||
102 | CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
103 | /* Little-endian. Early bytes are at LSB. */ | ||
104 | CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
105 | |||
106 | orr data1, data1, tmp2 | ||
107 | orr data2, data2, tmp2 | ||
108 | b .Lstart_realigned | ||
109 | |||
110 | .Lmisaligned8: | ||
111 | /* | ||
112 | * Work out how many leading bytes to compare one at a time, so that | ||
113 | * afterwards at least one string's address is aligned. | ||
114 | */ | ||
115 | and tmp1, src1, #7 | ||
116 | neg tmp1, tmp1 | ||
117 | add tmp1, tmp1, #8 | ||
118 | and tmp2, src2, #7 | ||
119 | neg tmp2, tmp2 | ||
120 | add tmp2, tmp2, #8 | ||
121 | subs tmp3, tmp1, tmp2 | ||
122 | csel pos, tmp1, tmp2, hi /*Choose the maximum. */ | ||
123 | .Ltinycmp: | ||
124 | ldrb data1w, [src1], #1 | ||
125 | ldrb data2w, [src2], #1 | ||
126 | subs pos, pos, #1 | ||
127 | ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ | ||
128 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | ||
129 | b.eq .Ltinycmp | ||
130 | cbnz pos, 1f /* found a NUL or a mismatch */ | ||
131 | cmp data1w, #1 | ||
132 | ccmp data1w, data2w, #0, cs | ||
133 | b.eq .Lstart_align /* the last bytes compared equal */ | ||
134 | 1: | ||
135 | sub result, data1, data2 | ||
136 | ret | ||
137 | |||
138 | .Lstart_align: | ||
139 | ands xzr, src1, #7 | ||
140 | b.eq .Lrecal_offset | ||
141 | /* process more leading bytes to make src1 aligned */ | ||
142 | add src1, src1, tmp3 | ||
143 | add src2, src2, tmp3 | ||
144 | /* load 8 bytes from the aligned src1 and the unaligned src2 */ | ||
145 | ldr data1, [src1], #8 | ||
146 | ldr data2, [src2], #8 | ||
147 | |||
148 | sub tmp1, data1, zeroones | ||
149 | orr tmp2, data1, #REP8_7f | ||
150 | bic has_nul, tmp1, tmp2 | ||
151 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
152 | orr syndrome, diff, has_nul | ||
153 | cbnz syndrome, .Lcal_cmpresult | ||
154 | /* How far is the current src2 from the alignment boundary? */ | ||
155 | and tmp3, tmp3, #7 | ||
156 | .Lrecal_offset: | ||
157 | neg pos, tmp3 | ||
158 | .Lloopcmp_proc: | ||
159 | /* | ||
160 | * Divide the eight bytes into two parts. First, back src2 up to an | ||
161 | * alignment boundary, load eight bytes from that aligned SRC2 address, | ||
162 | * then compare them with the corresponding bytes from SRC1. | ||
163 | * If all 8 bytes are equal, start the second part of the comparison; | ||
164 | * otherwise finish the comparison here. | ||
165 | * This special handling guarantees that every access stays within the | ||
166 | * thread/task address space, avoiding any out-of-range access. | ||
167 | */ | ||
168 | ldr data1, [src1,pos] | ||
169 | ldr data2, [src2,pos] | ||
170 | sub tmp1, data1, zeroones | ||
171 | orr tmp2, data1, #REP8_7f | ||
172 | bic has_nul, tmp1, tmp2 | ||
173 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
174 | orr syndrome, diff, has_nul | ||
175 | cbnz syndrome, .Lcal_cmpresult | ||
176 | |||
177 | /*The second part process*/ | ||
178 | ldr data1, [src1], #8 | ||
179 | ldr data2, [src2], #8 | ||
180 | sub tmp1, data1, zeroones | ||
181 | orr tmp2, data1, #REP8_7f | ||
182 | bic has_nul, tmp1, tmp2 | ||
183 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
184 | orr syndrome, diff, has_nul | ||
185 | cbz syndrome, .Lloopcmp_proc | ||
186 | |||
187 | .Lcal_cmpresult: | ||
188 | /* | ||
189 | * Byte-reverse the values to big-endian order so that CLZ can locate | ||
190 | * the first differing or NUL byte. | ||
191 | */ | ||
192 | CPU_LE( rev syndrome, syndrome ) | ||
193 | CPU_LE( rev data1, data1 ) | ||
194 | CPU_LE( rev data2, data2 ) | ||
195 | |||
196 | /* | ||
197 | * For big-endian we cannot use the trick with the syndrome value | ||
198 | * as carry-propagation can corrupt the upper bits if the trailing | ||
199 | * bytes in the string contain 0x01. | ||
200 | * However, if there is no NUL byte in the dword, we can generate | ||
201 | * the result directly. We cannot just subtract the bytes as the | ||
202 | * MSB might be significant. | ||
203 | */ | ||
204 | CPU_BE( cbnz has_nul, 1f ) | ||
205 | CPU_BE( cmp data1, data2 ) | ||
206 | CPU_BE( cset result, ne ) | ||
207 | CPU_BE( cneg result, result, lo ) | ||
208 | CPU_BE( ret ) | ||
209 | CPU_BE( 1: ) | ||
210 | /*Re-compute the NUL-byte detection, using a byte-reversed value. */ | ||
211 | CPU_BE( rev tmp3, data1 ) | ||
212 | CPU_BE( sub tmp1, tmp3, zeroones ) | ||
213 | CPU_BE( orr tmp2, tmp3, #REP8_7f ) | ||
214 | CPU_BE( bic has_nul, tmp1, tmp2 ) | ||
215 | CPU_BE( rev has_nul, has_nul ) | ||
216 | CPU_BE( orr syndrome, diff, has_nul ) | ||
217 | |||
218 | clz pos, syndrome | ||
219 | /* | ||
220 | * The MS-non-zero bit of the syndrome marks either the first bit | ||
221 | * that is different, or the top bit of the first zero byte. | ||
222 | * Shifting left now will bring the critical information into the | ||
223 | * top bits. | ||
224 | */ | ||
225 | lsl data1, data1, pos | ||
226 | lsl data2, data2, pos | ||
227 | /* | ||
228 | * But we need to zero-extend (char is unsigned) the value and then | ||
229 | * perform a signed 32-bit subtraction. | ||
230 | */ | ||
231 | lsr data1, data1, #56 | ||
232 | sub result, data1, data2, lsr #56 | ||
233 | ret | ||
234 | ENDPROC(strcmp) | ||
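The core of strcmp is the word-at-a-time NUL/difference detector described in the comments above, followed by the byte-reverse-and-CLZ step that turns the syndrome into a signed result. A little-endian C sketch of both steps, assuming 8-byte-aligned inputs and using compiler builtins in place of the rev and clz instructions:

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* (x - 0x01..01) & ~(x | 0x7f..7f) has the top bit of a byte set iff that
 * byte of x is zero, evaluated for all eight bytes at once. */
static uint64_t has_nul_bytes(uint64_t x)
{
	return (x - REP8_01) & ~(x | REP8_7f);
}

/* Little-endian equivalent of .Lcal_cmpresult: byte-reverse the syndrome so
 * CLZ finds the first interesting byte, shift both data words so that byte
 * reaches the top, then subtract the top bytes. */
int cmp_words_le(uint64_t data1, uint64_t data2)
{
	uint64_t syndrome = (data1 ^ data2) | has_nul_bytes(data1);
	unsigned int pos;

	if (!syndrome)
		return 0;	/* all eight bytes equal, no NUL seen yet */

	pos = __builtin_clzll(__builtin_bswap64(syndrome));
	data1 = __builtin_bswap64(data1) << pos;
	data2 = __builtin_bswap64(data2) << pos;
	return (int)(data1 >> 56) - (int)(data2 >> 56);
}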
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 000000000000..987b68b9ce44 --- /dev/null +++ b/arch/arm64/lib/strlen.S | |||
@@ -0,0 +1,126 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * calculate the length of a string | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string pointer | ||
33 | * Returns: | ||
34 | * x0 - the return length of specific string | ||
35 | */ | ||
36 | |||
37 | /* Arguments and results. */ | ||
38 | srcin .req x0 | ||
39 | len .req x0 | ||
40 | |||
41 | /* Locals and temporaries. */ | ||
42 | src .req x1 | ||
43 | data1 .req x2 | ||
44 | data2 .req x3 | ||
45 | data2a .req x4 | ||
46 | has_nul1 .req x5 | ||
47 | has_nul2 .req x6 | ||
48 | tmp1 .req x7 | ||
49 | tmp2 .req x8 | ||
50 | tmp3 .req x9 | ||
51 | tmp4 .req x10 | ||
52 | zeroones .req x11 | ||
53 | pos .req x12 | ||
54 | |||
55 | #define REP8_01 0x0101010101010101 | ||
56 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
57 | #define REP8_80 0x8080808080808080 | ||
58 | |||
59 | ENTRY(strlen) | ||
60 | mov zeroones, #REP8_01 | ||
61 | bic src, srcin, #15 | ||
62 | ands tmp1, srcin, #15 | ||
63 | b.ne .Lmisaligned | ||
64 | /* | ||
65 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
66 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
67 | * can be done in parallel across the entire word. | ||
68 | */ | ||
69 | /* | ||
70 | * The inner loop deals with two Dwords at a time. This has a | ||
71 | * slightly higher start-up cost, but we should win quite quickly, | ||
72 | * especially on cores with a high number of issue slots per | ||
73 | * cycle, as we get much better parallelism out of the operations. | ||
74 | */ | ||
75 | .Lloop: | ||
76 | ldp data1, data2, [src], #16 | ||
77 | .Lrealigned: | ||
78 | sub tmp1, data1, zeroones | ||
79 | orr tmp2, data1, #REP8_7f | ||
80 | sub tmp3, data2, zeroones | ||
81 | orr tmp4, data2, #REP8_7f | ||
82 | bic has_nul1, tmp1, tmp2 | ||
83 | bics has_nul2, tmp3, tmp4 | ||
84 | ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ | ||
85 | b.eq .Lloop | ||
86 | |||
87 | sub len, src, srcin | ||
88 | cbz has_nul1, .Lnul_in_data2 | ||
89 | CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ | ||
90 | sub len, len, #8 | ||
91 | mov has_nul2, has_nul1 | ||
92 | .Lnul_in_data2: | ||
93 | /* | ||
94 | * For big-endian, carry propagation (if the final byte in the | ||
95 | * string is 0x01) means we cannot use has_nul directly. The | ||
96 | * easiest way to get the correct byte is to byte-swap the data | ||
97 | * and calculate the syndrome a second time. | ||
98 | */ | ||
99 | CPU_BE( rev data2, data2 ) | ||
100 | CPU_BE( sub tmp1, data2, zeroones ) | ||
101 | CPU_BE( orr tmp2, data2, #REP8_7f ) | ||
102 | CPU_BE( bic has_nul2, tmp1, tmp2 ) | ||
103 | |||
104 | sub len, len, #8 | ||
105 | rev has_nul2, has_nul2 | ||
106 | clz pos, has_nul2 | ||
107 | add len, len, pos, lsr #3 /* Bits to bytes. */ | ||
108 | ret | ||
109 | |||
110 | .Lmisaligned: | ||
111 | cmp tmp1, #8 | ||
112 | neg tmp1, tmp1 | ||
113 | ldp data1, data2, [src], #16 | ||
114 | lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ | ||
115 | mov tmp2, #~0 | ||
116 | /* Big-endian. Early bytes are at MSB. */ | ||
117 | CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
118 | /* Little-endian. Early bytes are at LSB. */ | ||
119 | CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ | ||
120 | |||
121 | orr data1, data1, tmp2 | ||
122 | orr data2a, data2, tmp2 | ||
123 | csinv data1, data1, xzr, le | ||
124 | csel data2, data2, data2a, le | ||
125 | b .Lrealigned | ||
126 | ENDPROC(strlen) | ||
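strlen uses the same detector and then converts the marker in the final dword into a byte offset with a byte-reverse plus CLZ. A simplified little-endian C sketch (one dword per iteration instead of two, and assuming an 8-byte-aligned string, so the misalignment masking above is not needed):

#include <stddef.h>
#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

size_t strlen_sketch(const char *s)
{
	const uint64_t *p = (const uint64_t *)s;	/* assumed aligned */
	size_t len = 0;
	uint64_t word, nul;

	for (;;) {
		word = *p++;
		nul = (word - REP8_01) & ~(word | REP8_7f);
		if (nul)
			break;			/* this dword holds the NUL */
		len += 8;
	}
	/* CLZ of the byte-reversed marker counts the bits, and hence the
	 * bytes, that precede the first NUL within this dword. */
	return len + (__builtin_clzll(__builtin_bswap64(nul)) >> 3);
}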
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 000000000000..0224cf5a5533 --- /dev/null +++ b/arch/arm64/lib/strncmp.S | |||
@@ -0,0 +1,310 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * compare two strings | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string 1 pointer | ||
33 | * x1 - const string 2 pointer | ||
34 | * x2 - the maximal length to be compared | ||
35 | * Returns: | ||
36 | * x0 - an integer less than, equal to, or greater than zero if s1 is found, | ||
37 | * respectively, to be less than, to match, or be greater than s2. | ||
38 | */ | ||
39 | |||
40 | #define REP8_01 0x0101010101010101 | ||
41 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
42 | #define REP8_80 0x8080808080808080 | ||
43 | |||
44 | /* Parameters and result. */ | ||
45 | src1 .req x0 | ||
46 | src2 .req x1 | ||
47 | limit .req x2 | ||
48 | result .req x0 | ||
49 | |||
50 | /* Internal variables. */ | ||
51 | data1 .req x3 | ||
52 | data1w .req w3 | ||
53 | data2 .req x4 | ||
54 | data2w .req w4 | ||
55 | has_nul .req x5 | ||
56 | diff .req x6 | ||
57 | syndrome .req x7 | ||
58 | tmp1 .req x8 | ||
59 | tmp2 .req x9 | ||
60 | tmp3 .req x10 | ||
61 | zeroones .req x11 | ||
62 | pos .req x12 | ||
63 | limit_wd .req x13 | ||
64 | mask .req x14 | ||
65 | endloop .req x15 | ||
66 | |||
67 | ENTRY(strncmp) | ||
68 | cbz limit, .Lret0 | ||
69 | eor tmp1, src1, src2 | ||
70 | mov zeroones, #REP8_01 | ||
71 | tst tmp1, #7 | ||
72 | b.ne .Lmisaligned8 | ||
73 | ands tmp1, src1, #7 | ||
74 | b.ne .Lmutual_align | ||
75 | /* Calculate the number of full and partial words -1. */ | ||
76 | /* | ||
77 | * When limit is a multiple of 8, failing to subtract 1 here would | ||
78 | * make the handling of the last dword go wrong. | ||
79 | */ | ||
80 | sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ | ||
81 | lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ | ||
82 | |||
83 | /* | ||
84 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
85 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
86 | * can be done in parallel across the entire word. | ||
87 | */ | ||
88 | .Lloop_aligned: | ||
89 | ldr data1, [src1], #8 | ||
90 | ldr data2, [src2], #8 | ||
91 | .Lstart_realigned: | ||
92 | subs limit_wd, limit_wd, #1 | ||
93 | sub tmp1, data1, zeroones | ||
94 | orr tmp2, data1, #REP8_7f | ||
95 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
96 | csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ | ||
97 | bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ | ||
98 | ccmp endloop, #0, #0, eq | ||
99 | b.eq .Lloop_aligned | ||
100 | |||
101 | /*Not reached the limit, must have found the end or a diff. */ | ||
102 | tbz limit_wd, #63, .Lnot_limit | ||
103 | |||
104 | /* Limit % 8 == 0 => all bytes significant. */ | ||
105 | ands limit, limit, #7 | ||
106 | b.eq .Lnot_limit | ||
107 | |||
108 | lsl limit, limit, #3 /* Bits -> bytes. */ | ||
109 | mov mask, #~0 | ||
110 | CPU_BE( lsr mask, mask, limit ) | ||
111 | CPU_LE( lsl mask, mask, limit ) | ||
112 | bic data1, data1, mask | ||
113 | bic data2, data2, mask | ||
114 | |||
115 | /* Make sure that the NUL byte is marked in the syndrome. */ | ||
116 | orr has_nul, has_nul, mask | ||
117 | |||
118 | .Lnot_limit: | ||
119 | orr syndrome, diff, has_nul | ||
120 | b .Lcal_cmpresult | ||
121 | |||
122 | .Lmutual_align: | ||
123 | /* | ||
124 | * Sources are mutually aligned, but are not currently at an | ||
125 | * alignment boundary. Round down the addresses and then mask off | ||
126 | * the bytes that precede the start point. | ||
127 | * We also need to adjust the limit calculations, but without | ||
128 | * overflowing if the limit is near ULONG_MAX. | ||
129 | */ | ||
130 | bic src1, src1, #7 | ||
131 | bic src2, src2, #7 | ||
132 | ldr data1, [src1], #8 | ||
133 | neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ | ||
134 | ldr data2, [src2], #8 | ||
135 | mov tmp2, #~0 | ||
136 | sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ | ||
137 | /* Big-endian. Early bytes are at MSB. */ | ||
138 | CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ | ||
139 | /* Little-endian. Early bytes are at LSB. */ | ||
140 | CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ | ||
141 | |||
142 | and tmp3, limit_wd, #7 | ||
143 | lsr limit_wd, limit_wd, #3 | ||
144 | /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ | ||
145 | add limit, limit, tmp1 | ||
146 | add tmp3, tmp3, tmp1 | ||
147 | orr data1, data1, tmp2 | ||
148 | orr data2, data2, tmp2 | ||
149 | add limit_wd, limit_wd, tmp3, lsr #3 | ||
150 | b .Lstart_realigned | ||
151 | |||
152 | /*when src1 offset is not equal to src2 offset...*/ | ||
153 | .Lmisaligned8: | ||
154 | cmp limit, #8 | ||
155 | b.lo .Ltiny8proc /*limit < 8... */ | ||
156 | /* | ||
157 | * Work out how many leading bytes to compare one at a time, so that | ||
158 | * afterwards at least one string's address is aligned. */ | ||
159 | and tmp1, src1, #7 | ||
160 | neg tmp1, tmp1 | ||
161 | add tmp1, tmp1, #8 | ||
162 | and tmp2, src2, #7 | ||
163 | neg tmp2, tmp2 | ||
164 | add tmp2, tmp2, #8 | ||
165 | subs tmp3, tmp1, tmp2 | ||
166 | csel pos, tmp1, tmp2, hi /*Choose the maximum. */ | ||
167 | /* | ||
168 | * Here, limit is at least 8, so run .Ltinycmp directly | ||
169 | * without checking the limit. */ | ||
170 | sub limit, limit, pos | ||
171 | .Ltinycmp: | ||
172 | ldrb data1w, [src1], #1 | ||
173 | ldrb data2w, [src2], #1 | ||
174 | subs pos, pos, #1 | ||
175 | ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ | ||
176 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | ||
177 | b.eq .Ltinycmp | ||
178 | cbnz pos, 1f /* found a NUL or a mismatch */ | ||
179 | cmp data1w, #1 | ||
180 | ccmp data1w, data2w, #0, cs | ||
181 | b.eq .Lstart_align /* the last bytes compared equal */ | ||
182 | 1: | ||
183 | sub result, data1, data2 | ||
184 | ret | ||
185 | |||
186 | .Lstart_align: | ||
187 | lsr limit_wd, limit, #3 | ||
188 | cbz limit_wd, .Lremain8 | ||
189 | /* process more leading bytes to make src1 aligned */ | ||
190 | ands xzr, src1, #7 | ||
191 | b.eq .Lrecal_offset | ||
192 | add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ | ||
193 | add src2, src2, tmp3 | ||
194 | ldr data1, [src1], #8 | ||
195 | ldr data2, [src2], #8 | ||
196 | |||
197 | sub limit, limit, tmp3 | ||
198 | lsr limit_wd, limit, #3 | ||
199 | subs limit_wd, limit_wd, #1 | ||
200 | |||
201 | sub tmp1, data1, zeroones | ||
202 | orr tmp2, data1, #REP8_7f | ||
203 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
204 | csinv endloop, diff, xzr, ne /* if limit_wd is 0, finish the comparison */ | ||
205 | bics has_nul, tmp1, tmp2 | ||
206 | ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte */ | ||
207 | b.ne .Lunequal_proc | ||
208 | /* How far is the current src2 from the alignment boundary? */ | ||
209 | and tmp3, tmp3, #7 | ||
210 | .Lrecal_offset: | ||
211 | neg pos, tmp3 | ||
212 | .Lloopcmp_proc: | ||
213 | /* | ||
214 | * Divide the eight bytes into two parts. First, back src2 up to an | ||
215 | * alignment boundary, load eight bytes from that aligned SRC2 address, | ||
216 | * then compare them with the corresponding bytes from SRC1. | ||
217 | * If all 8 bytes are equal, start the second part of the comparison; | ||
218 | * otherwise finish the comparison here. | ||
219 | * This special handling guarantees that every access stays within the | ||
220 | * thread/task address space, avoiding any out-of-range access. | ||
221 | */ | ||
222 | ldr data1, [src1,pos] | ||
223 | ldr data2, [src2,pos] | ||
224 | sub tmp1, data1, zeroones | ||
225 | orr tmp2, data1, #REP8_7f | ||
226 | bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ | ||
227 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
228 | csinv endloop, diff, xzr, eq | ||
229 | cbnz endloop, .Lunequal_proc | ||
230 | |||
231 | /*The second part process*/ | ||
232 | ldr data1, [src1], #8 | ||
233 | ldr data2, [src2], #8 | ||
234 | subs limit_wd, limit_wd, #1 | ||
235 | sub tmp1, data1, zeroones | ||
236 | orr tmp2, data1, #REP8_7f | ||
237 | eor diff, data1, data2 /* Non-zero if differences found. */ | ||
238 | csinv endloop, diff, xzr, ne /* if limit_wd is 0, finish the comparison */ | ||
239 | bics has_nul, tmp1, tmp2 | ||
240 | ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte */ | ||
241 | b.eq .Lloopcmp_proc | ||
242 | |||
243 | .Lunequal_proc: | ||
244 | orr syndrome, diff, has_nul | ||
245 | cbz syndrome, .Lremain8 | ||
246 | .Lcal_cmpresult: | ||
247 | /* | ||
248 | * Byte-reverse the values to big-endian order so that CLZ can locate | ||
249 | * the first differing or NUL byte. | ||
250 | */ | ||
251 | CPU_LE( rev syndrome, syndrome ) | ||
252 | CPU_LE( rev data1, data1 ) | ||
253 | CPU_LE( rev data2, data2 ) | ||
254 | /* | ||
255 | * For big-endian we cannot use the trick with the syndrome value | ||
256 | * as carry-propagation can corrupt the upper bits if the trailing | ||
257 | * bytes in the string contain 0x01. | ||
258 | * However, if there is no NUL byte in the dword, we can generate | ||
259 | * the result directly. We can't just subtract the bytes as the | ||
260 | * MSB might be significant. | ||
261 | */ | ||
262 | CPU_BE( cbnz has_nul, 1f ) | ||
263 | CPU_BE( cmp data1, data2 ) | ||
264 | CPU_BE( cset result, ne ) | ||
265 | CPU_BE( cneg result, result, lo ) | ||
266 | CPU_BE( ret ) | ||
267 | CPU_BE( 1: ) | ||
268 | /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ | ||
269 | CPU_BE( rev tmp3, data1 ) | ||
270 | CPU_BE( sub tmp1, tmp3, zeroones ) | ||
271 | CPU_BE( orr tmp2, tmp3, #REP8_7f ) | ||
272 | CPU_BE( bic has_nul, tmp1, tmp2 ) | ||
273 | CPU_BE( rev has_nul, has_nul ) | ||
274 | CPU_BE( orr syndrome, diff, has_nul ) | ||
275 | /* | ||
276 | * The MS-non-zero bit of the syndrome marks either the first bit | ||
277 | * that is different, or the top bit of the first zero byte. | ||
278 | * Shifting left now will bring the critical information into the | ||
279 | * top bits. | ||
280 | */ | ||
281 | clz pos, syndrome | ||
282 | lsl data1, data1, pos | ||
283 | lsl data2, data2, pos | ||
284 | /* | ||
285 | * But we need to zero-extend (char is unsigned) the value and then | ||
286 | * perform a signed 32-bit subtraction. | ||
287 | */ | ||
288 | lsr data1, data1, #56 | ||
289 | sub result, data1, data2, lsr #56 | ||
290 | ret | ||
291 | |||
292 | .Lremain8: | ||
293 | /* Limit % 8 == 0 => all bytes significant. */ | ||
294 | ands limit, limit, #7 | ||
295 | b.eq .Lret0 | ||
296 | .Ltiny8proc: | ||
297 | ldrb data1w, [src1], #1 | ||
298 | ldrb data2w, [src2], #1 | ||
299 | subs limit, limit, #1 | ||
300 | |||
301 | ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ | ||
302 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | ||
303 | b.eq .Ltiny8proc | ||
304 | sub result, data1, data2 | ||
305 | ret | ||
306 | |||
307 | .Lret0: | ||
308 | mov result, #0 | ||
309 | ret | ||
310 | ENDPROC(strncmp) | ||
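What distinguishes strncmp from strcmp is the limit bookkeeping: the aligned loop counts whole dwords with limit_wd = (limit - 1) / 8, and when limit is not a multiple of 8 a mask makes the bytes past the limit compare equal and look like NUL. A little-endian sketch of those two calculations (function names are illustrative):

#include <stdint.h>

/* Remaining-dword counter for the aligned loop.  Starting from
 * (limit - 1) / 8 makes the counter go negative exactly on the dword that
 * contains the limit; a plain limit / 8 would compare one dword too many
 * whenever limit is a multiple of 8. */
uint64_t limit_to_dwords(uint64_t limit)
{
	return (limit - 1) >> 3;	/* caller ensures limit != 0 */
}

/* Mask for the final dword when limit % 8 != 0 (little-endian): bytes past
 * the limit are cleared in both operands (bic) and set in the NUL marker
 * (orr), so the comparison terminates at the limit. */
uint64_t final_dword_mask_le(uint64_t limit)
{
	return ~0ULL << ((limit & 7) * 8);
}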
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S new file mode 100644 index 000000000000..2ca665711bf2 --- /dev/null +++ b/arch/arm64/lib/strnlen.S | |||
@@ -0,0 +1,171 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2013 ARM Ltd. | ||
3 | * Copyright (C) 2013 Linaro. | ||
4 | * | ||
5 | * This code is based on glibc cortex strings work originally authored by Linaro | ||
6 | * and re-licensed under GPLv2 for the Linux kernel. The original code can | ||
7 | * be found @ | ||
8 | * | ||
9 | * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ | ||
10 | * files/head:/src/aarch64/ | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License version 2 as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
23 | */ | ||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | #include <asm/assembler.h> | ||
27 | |||
28 | /* | ||
29 | * determine the length of a fixed-size string | ||
30 | * | ||
31 | * Parameters: | ||
32 | * x0 - const string pointer | ||
33 | * x1 - maximal string length | ||
34 | * Returns: | ||
35 | * x0 - the return length of specific string | ||
36 | */ | ||
37 | |||
38 | /* Arguments and results. */ | ||
39 | srcin .req x0 | ||
40 | len .req x0 | ||
41 | limit .req x1 | ||
42 | |||
43 | /* Locals and temporaries. */ | ||
44 | src .req x2 | ||
45 | data1 .req x3 | ||
46 | data2 .req x4 | ||
47 | data2a .req x5 | ||
48 | has_nul1 .req x6 | ||
49 | has_nul2 .req x7 | ||
50 | tmp1 .req x8 | ||
51 | tmp2 .req x9 | ||
52 | tmp3 .req x10 | ||
53 | tmp4 .req x11 | ||
54 | zeroones .req x12 | ||
55 | pos .req x13 | ||
56 | limit_wd .req x14 | ||
57 | |||
58 | #define REP8_01 0x0101010101010101 | ||
59 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||
60 | #define REP8_80 0x8080808080808080 | ||
61 | |||
62 | ENTRY(strnlen) | ||
63 | cbz limit, .Lhit_limit | ||
64 | mov zeroones, #REP8_01 | ||
65 | bic src, srcin, #15 | ||
66 | ands tmp1, srcin, #15 | ||
67 | b.ne .Lmisaligned | ||
68 | /* Calculate the number of full and partial words -1. */ | ||
69 | sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ | ||
70 | lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ | ||
71 | |||
72 | /* | ||
73 | * NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||
74 | * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||
75 | * can be done in parallel across the entire word. | ||
76 | */ | ||
77 | /* | ||
78 | * The inner loop deals with two Dwords at a time. This has a | ||
79 | * slightly higher start-up cost, but we should win quite quickly, | ||
80 | * especially on cores with a high number of issue slots per | ||
81 | * cycle, as we get much better parallelism out of the operations. | ||
82 | */ | ||
83 | .Lloop: | ||
84 | ldp data1, data2, [src], #16 | ||
85 | .Lrealigned: | ||
86 | sub tmp1, data1, zeroones | ||
87 | orr tmp2, data1, #REP8_7f | ||
88 | sub tmp3, data2, zeroones | ||
89 | orr tmp4, data2, #REP8_7f | ||
90 | bic has_nul1, tmp1, tmp2 | ||
91 | bic has_nul2, tmp3, tmp4 | ||
92 | subs limit_wd, limit_wd, #1 | ||
93 | orr tmp1, has_nul1, has_nul2 | ||
94 | ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ | ||
95 | b.eq .Lloop | ||
96 | |||
97 | cbz tmp1, .Lhit_limit /* No null in final Qword. */ | ||
98 | |||
99 | /* | ||
100 | * We know there's a null in the final Qword. The easiest thing | ||
101 | * to do now is work out the length of the string and return | ||
102 | * MIN (len, limit). | ||
103 | */ | ||
104 | sub len, src, srcin | ||
105 | cbz has_nul1, .Lnul_in_data2 | ||
106 | CPU_BE( mov data2, data1 ) /* prepare data to re-calculate the syndrome */ | ||
107 | |||
108 | sub len, len, #8 | ||
109 | mov has_nul2, has_nul1 | ||
110 | .Lnul_in_data2: | ||
111 | /* | ||
112 | * For big-endian, carry propagation (if the final byte in the | ||
113 | * string is 0x01) means we cannot use has_nul directly. The | ||
114 | * easiest way to get the correct byte is to byte-swap the data | ||
115 | * and calculate the syndrome a second time. | ||
116 | */ | ||
117 | CPU_BE( rev data2, data2 ) | ||
118 | CPU_BE( sub tmp1, data2, zeroones ) | ||
119 | CPU_BE( orr tmp2, data2, #REP8_7f ) | ||
120 | CPU_BE( bic has_nul2, tmp1, tmp2 ) | ||
121 | |||
122 | sub len, len, #8 | ||
123 | rev has_nul2, has_nul2 | ||
124 | clz pos, has_nul2 | ||
125 | add len, len, pos, lsr #3 /* Bits to bytes. */ | ||
126 | cmp len, limit | ||
127 | csel len, len, limit, ls /* Return the lower value. */ | ||
128 | ret | ||
129 | |||
130 | .Lmisaligned: | ||
131 | /* | ||
132 | * Deal with a partial first word. | ||
133 | * We're doing two things in parallel here; | ||
134 | * 1) Calculate the number of words (but avoiding overflow if | ||
135 | * limit is near ULONG_MAX) - to do this we need to work out | ||
136 | * limit + tmp1 - 1 as a 65-bit value before shifting it; | ||
137 | * 2) Load and mask the initial data words - we force the bytes | ||
138 | * before the ones we are interested in to 0xff - this ensures | ||
139 | * early bytes will not hit any zero detection. | ||
140 | */ | ||
141 | ldp data1, data2, [src], #16 | ||
142 | |||
143 | sub limit_wd, limit, #1 | ||
144 | and tmp3, limit_wd, #15 | ||
145 | lsr limit_wd, limit_wd, #4 | ||
146 | |||
147 | add tmp3, tmp3, tmp1 | ||
148 | add limit_wd, limit_wd, tmp3, lsr #4 | ||
149 | |||
150 | neg tmp4, tmp1 | ||
151 | lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ | ||
152 | |||
153 | mov tmp2, #~0 | ||
154 | /* Big-endian. Early bytes are at MSB. */ | ||
155 | CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ | ||
156 | /* Little-endian. Early bytes are at LSB. */ | ||
157 | CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ | ||
158 | |||
159 | cmp tmp1, #8 | ||
160 | |||
161 | orr data1, data1, tmp2 | ||
162 | orr data2a, data2, tmp2 | ||
163 | |||
164 | csinv data1, data1, xzr, le | ||
165 | csel data2, data2, data2a, le | ||
166 | b .Lrealigned | ||
167 | |||
168 | .Lhit_limit: | ||
169 | mov len, limit | ||
170 | ret | ||
171 | ENDPROC(strnlen) | ||
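The misaligned-start handling in strnlen (and the mutual-alignment paths in the other routines) rests on one observation: a byte forced to 0xff can never look like NUL to the (x - 0x01..) & ~(x | 0x7f..) detector. A small little-endian sketch of how the bytes that precede the real start of the string are neutralised (offset assumed to be between 1 and 7):

#include <stdint.h>

/* The load was rounded down to an aligned address, so 'offset' bytes at the
 * low end of the word lie before the start of the string.  Forcing them to
 * 0xff keeps them from ever matching the zero-byte test. */
uint64_t mask_leading_bytes_le(uint64_t word, unsigned int offset)
{
	uint64_t mask = ~0ULL >> (64 - 8 * offset);	/* low 'offset' bytes */

	return word | mask;
}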
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index b51d36401d83..3ecb56c624d3 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | obj-y := dma-mapping.o extable.o fault.o init.o \ | 1 | obj-y := dma-mapping.o extable.o fault.o init.o \ |
2 | cache.o copypage.o flush.o \ | 2 | cache.o copypage.o flush.o \ |
3 | ioremap.o mmap.o pgd.o mmu.o \ | 3 | ioremap.o mmap.o pgd.o mmu.o \ |
4 | context.o tlb.o proc.o | 4 | context.o proc.o |
5 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | 5 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o |
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index fda756875fa6..23663837acff 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S | |||
@@ -31,7 +31,7 @@ | |||
31 | * Corrupted registers: x0-x7, x9-x11 | 31 | * Corrupted registers: x0-x7, x9-x11 |
32 | */ | 32 | */ |
33 | __flush_dcache_all: | 33 | __flush_dcache_all: |
34 | dsb sy // ensure ordering with previous memory accesses | 34 | dmb sy // ensure ordering with previous memory accesses |
35 | mrs x0, clidr_el1 // read clidr | 35 | mrs x0, clidr_el1 // read clidr |
36 | and x3, x0, #0x7000000 // extract loc from clidr | 36 | and x3, x0, #0x7000000 // extract loc from clidr |
37 | lsr x3, x3, #23 // left align loc bit field | 37 | lsr x3, x3, #23 // left align loc bit field |
@@ -128,7 +128,7 @@ USER(9f, dc cvau, x4 ) // clean D line to PoU | |||
128 | add x4, x4, x2 | 128 | add x4, x4, x2 |
129 | cmp x4, x1 | 129 | cmp x4, x1 |
130 | b.lo 1b | 130 | b.lo 1b |
131 | dsb sy | 131 | dsb ish |
132 | 132 | ||
133 | icache_line_size x2, x3 | 133 | icache_line_size x2, x3 |
134 | sub x3, x2, #1 | 134 | sub x3, x2, #1 |
@@ -139,7 +139,7 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU | |||
139 | cmp x4, x1 | 139 | cmp x4, x1 |
140 | b.lo 1b | 140 | b.lo 1b |
141 | 9: // ignore any faulting cache operation | 141 | 9: // ignore any faulting cache operation |
142 | dsb sy | 142 | dsb ish |
143 | isb | 143 | isb |
144 | ret | 144 | ret |
145 | ENDPROC(flush_icache_range) | 145 | ENDPROC(flush_icache_range) |
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index c851eb44dc50..4164c5ace9f8 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c | |||
@@ -115,7 +115,7 @@ static void *__dma_alloc_noncoherent(struct device *dev, size_t size, | |||
115 | for (i = 0; i < (size >> PAGE_SHIFT); i++) | 115 | for (i = 0; i < (size >> PAGE_SHIFT); i++) |
116 | map[i] = page + i; | 116 | map[i] = page + i; |
117 | coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, | 117 | coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, |
118 | __get_dma_pgprot(attrs, pgprot_default, false)); | 118 | __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false)); |
119 | kfree(map); | 119 | kfree(map); |
120 | if (!coherent_ptr) | 120 | if (!coherent_ptr) |
121 | goto no_map; | 121 | goto no_map; |
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c23751b06120..bcc965e2cce1 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c | |||
@@ -32,6 +32,7 @@ | |||
32 | 32 | ||
33 | #include <asm/exception.h> | 33 | #include <asm/exception.h> |
34 | #include <asm/debug-monitors.h> | 34 | #include <asm/debug-monitors.h> |
35 | #include <asm/esr.h> | ||
35 | #include <asm/system_misc.h> | 36 | #include <asm/system_misc.h> |
36 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
37 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
@@ -123,6 +124,7 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr, | |||
123 | } | 124 | } |
124 | 125 | ||
125 | tsk->thread.fault_address = addr; | 126 | tsk->thread.fault_address = addr; |
127 | tsk->thread.fault_code = esr; | ||
126 | si.si_signo = sig; | 128 | si.si_signo = sig; |
127 | si.si_errno = 0; | 129 | si.si_errno = 0; |
128 | si.si_code = code; | 130 | si.si_code = code; |
@@ -148,8 +150,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re | |||
148 | #define VM_FAULT_BADMAP 0x010000 | 150 | #define VM_FAULT_BADMAP 0x010000 |
149 | #define VM_FAULT_BADACCESS 0x020000 | 151 | #define VM_FAULT_BADACCESS 0x020000 |
150 | 152 | ||
151 | #define ESR_WRITE (1 << 6) | ||
152 | #define ESR_CM (1 << 8) | ||
153 | #define ESR_LNX_EXEC (1 << 24) | 153 | #define ESR_LNX_EXEC (1 << 24) |
154 | 154 | ||
155 | static int __do_page_fault(struct mm_struct *mm, unsigned long addr, | 155 | static int __do_page_fault(struct mm_struct *mm, unsigned long addr, |
@@ -218,7 +218,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, | |||
218 | 218 | ||
219 | if (esr & ESR_LNX_EXEC) { | 219 | if (esr & ESR_LNX_EXEC) { |
220 | vm_flags = VM_EXEC; | 220 | vm_flags = VM_EXEC; |
221 | } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { | 221 | } else if ((esr & ESR_EL1_WRITE) && !(esr & ESR_EL1_CM)) { |
222 | vm_flags = VM_WRITE; | 222 | vm_flags = VM_WRITE; |
223 | mm_flags |= FAULT_FLAG_WRITE; | 223 | mm_flags |= FAULT_FLAG_WRITE; |
224 | } | 224 | } |
@@ -525,7 +525,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, | |||
525 | info.si_errno = 0; | 525 | info.si_errno = 0; |
526 | info.si_code = inf->code; | 526 | info.si_code = inf->code; |
527 | info.si_addr = (void __user *)addr; | 527 | info.si_addr = (void __user *)addr; |
528 | arm64_notify_die("", regs, &info, esr); | 528 | arm64_notify_die("", regs, &info, 0); |
529 | 529 | ||
530 | return 0; | 530 | return 0; |
531 | } | 531 | } |
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4a829a210bb6..c43f1dd19489 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c | |||
@@ -43,11 +43,6 @@ | |||
43 | struct page *empty_zero_page; | 43 | struct page *empty_zero_page; |
44 | EXPORT_SYMBOL(empty_zero_page); | 44 | EXPORT_SYMBOL(empty_zero_page); |
45 | 45 | ||
46 | pgprot_t pgprot_default; | ||
47 | EXPORT_SYMBOL(pgprot_default); | ||
48 | |||
49 | static pmdval_t prot_sect_kernel; | ||
50 | |||
51 | struct cachepolicy { | 46 | struct cachepolicy { |
52 | const char policy[16]; | 47 | const char policy[16]; |
53 | u64 mair; | 48 | u64 mair; |
@@ -122,33 +117,6 @@ static int __init early_cachepolicy(char *p) | |||
122 | } | 117 | } |
123 | early_param("cachepolicy", early_cachepolicy); | 118 | early_param("cachepolicy", early_cachepolicy); |
124 | 119 | ||
125 | /* | ||
126 | * Adjust the PMD section entries according to the CPU in use. | ||
127 | */ | ||
128 | void __init init_mem_pgprot(void) | ||
129 | { | ||
130 | pteval_t default_pgprot; | ||
131 | int i; | ||
132 | |||
133 | default_pgprot = PTE_ATTRINDX(MT_NORMAL); | ||
134 | prot_sect_kernel = PMD_TYPE_SECT | PMD_SECT_AF | PMD_ATTRINDX(MT_NORMAL); | ||
135 | |||
136 | #ifdef CONFIG_SMP | ||
137 | /* | ||
138 | * Mark memory with the "shared" attribute for SMP systems | ||
139 | */ | ||
140 | default_pgprot |= PTE_SHARED; | ||
141 | prot_sect_kernel |= PMD_SECT_S; | ||
142 | #endif | ||
143 | |||
144 | for (i = 0; i < 16; i++) { | ||
145 | unsigned long v = pgprot_val(protection_map[i]); | ||
146 | protection_map[i] = __pgprot(v | default_pgprot); | ||
147 | } | ||
148 | |||
149 | pgprot_default = __pgprot(PTE_TYPE_PAGE | PTE_AF | default_pgprot); | ||
150 | } | ||
151 | |||
152 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | 120 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, |
153 | unsigned long size, pgprot_t vma_prot) | 121 | unsigned long size, pgprot_t vma_prot) |
154 | { | 122 | { |
@@ -196,11 +164,10 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, | |||
196 | pgprot_t prot_pte; | 164 | pgprot_t prot_pte; |
197 | 165 | ||
198 | if (map_io) { | 166 | if (map_io) { |
199 | prot_sect = PMD_TYPE_SECT | PMD_SECT_AF | | 167 | prot_sect = PROT_SECT_DEVICE_nGnRE; |
200 | PMD_ATTRINDX(MT_DEVICE_nGnRE); | ||
201 | prot_pte = __pgprot(PROT_DEVICE_nGnRE); | 168 | prot_pte = __pgprot(PROT_DEVICE_nGnRE); |
202 | } else { | 169 | } else { |
203 | prot_sect = prot_sect_kernel; | 170 | prot_sect = PROT_SECT_NORMAL_EXEC; |
204 | prot_pte = PAGE_KERNEL_EXEC; | 171 | prot_pte = PAGE_KERNEL_EXEC; |
205 | } | 172 | } |
206 | 173 | ||
@@ -242,7 +209,30 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, | |||
242 | 209 | ||
243 | do { | 210 | do { |
244 | next = pud_addr_end(addr, end); | 211 | next = pud_addr_end(addr, end); |
245 | alloc_init_pmd(pud, addr, next, phys, map_io); | 212 | |
213 | /* | ||
214 | * For 4K granule only, attempt to put down a 1GB block | ||
215 | */ | ||
216 | if (!map_io && (PAGE_SHIFT == 12) && | ||
217 | ((addr | next | phys) & ~PUD_MASK) == 0) { | ||
218 | pud_t old_pud = *pud; | ||
219 | set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC)); | ||
220 | |||
221 | /* | ||
222 | * If we have an old value for a pud, it will | ||
223 | * be pointing to a pmd table that we no longer | ||
224 | * need (from swapper_pg_dir). | ||
225 | * | ||
226 | * Look up the old pmd table and free it. | ||
227 | */ | ||
228 | if (!pud_none(old_pud)) { | ||
229 | phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); | ||
230 | memblock_free(table, PAGE_SIZE); | ||
231 | flush_tlb_all(); | ||
232 | } | ||
233 | } else { | ||
234 | alloc_init_pmd(pud, addr, next, phys, map_io); | ||
235 | } | ||
246 | phys += next - addr; | 236 | phys += next - addr; |
247 | } while (pud++, addr = next, addr != end); | 237 | } while (pud++, addr = next, addr != end); |
248 | } | 238 | } |
@@ -399,6 +389,9 @@ int kern_addr_valid(unsigned long addr) | |||
399 | if (pud_none(*pud)) | 389 | if (pud_none(*pud)) |
400 | return 0; | 390 | return 0; |
401 | 391 | ||
392 | if (pud_sect(*pud)) | ||
393 | return pfn_valid(pud_pfn(*pud)); | ||
394 | |||
402 | pmd = pmd_offset(pud, addr); | 395 | pmd = pmd_offset(pud, addr); |
403 | if (pmd_none(*pmd)) | 396 | if (pmd_none(*pmd)) |
404 | return 0; | 397 | return 0; |
@@ -446,7 +439,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) | |||
446 | if (!p) | 439 | if (!p) |
447 | return -ENOMEM; | 440 | return -ENOMEM; |
448 | 441 | ||
449 | set_pmd(pmd, __pmd(__pa(p) | prot_sect_kernel)); | 442 | set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL)); |
450 | } else | 443 | } else |
451 | vmemmap_verify((pte_t *)pmd, node, addr, next); | 444 | vmemmap_verify((pte_t *)pmd, node, addr, next); |
452 | } while (addr = next, addr != end); | 445 | } while (addr = next, addr != end); |
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9042aff5e9e3..7736779c9809 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S | |||
@@ -182,7 +182,7 @@ ENDPROC(cpu_do_switch_mm) | |||
182 | ENTRY(__cpu_setup) | 182 | ENTRY(__cpu_setup) |
183 | ic iallu // I+BTB cache invalidate | 183 | ic iallu // I+BTB cache invalidate |
184 | tlbi vmalle1is // invalidate I + D TLBs | 184 | tlbi vmalle1is // invalidate I + D TLBs |
185 | dsb sy | 185 | dsb ish |
186 | 186 | ||
187 | mov x0, #3 << 20 | 187 | mov x0, #3 << 20 |
188 | msr cpacr_el1, x0 // Enable FP/ASIMD | 188 | msr cpacr_el1, x0 // Enable FP/ASIMD |
diff --git a/arch/arm64/mm/tlb.S b/arch/arm64/mm/tlb.S deleted file mode 100644 index 19da91e0cd27..000000000000 --- a/arch/arm64/mm/tlb.S +++ /dev/null | |||
@@ -1,71 +0,0 @@ | |||
1 | /* | ||
2 | * Based on arch/arm/mm/tlb.S | ||
3 | * | ||
4 | * Copyright (C) 1997-2002 Russell King | ||
5 | * Copyright (C) 2012 ARM Ltd. | ||
6 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License version 2 as | ||
10 | * published by the Free Software Foundation. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
19 | */ | ||
20 | #include <linux/linkage.h> | ||
21 | #include <asm/assembler.h> | ||
22 | #include <asm/asm-offsets.h> | ||
23 | #include <asm/page.h> | ||
24 | #include <asm/tlbflush.h> | ||
25 | #include "proc-macros.S" | ||
26 | |||
27 | /* | ||
28 | * __cpu_flush_user_tlb_range(start, end, vma) | ||
29 | * | ||
30 | * Invalidate a range of TLB entries in the specified address space. | ||
31 | * | ||
32 | * - start - start address (may not be aligned) | ||
33 | * - end - end address (exclusive, may not be aligned) | ||
34 | * - vma - vma_struct describing address range | ||
35 | */ | ||
36 | ENTRY(__cpu_flush_user_tlb_range) | ||
37 | vma_vm_mm x3, x2 // get vma->vm_mm | ||
38 | mmid w3, x3 // get vm_mm->context.id | ||
39 | dsb sy | ||
40 | lsr x0, x0, #12 // align address | ||
41 | lsr x1, x1, #12 | ||
42 | bfi x0, x3, #48, #16 // start VA and ASID | ||
43 | bfi x1, x3, #48, #16 // end VA and ASID | ||
44 | 1: tlbi vae1is, x0 // TLB invalidate by address and ASID | ||
45 | add x0, x0, #1 | ||
46 | cmp x0, x1 | ||
47 | b.lo 1b | ||
48 | dsb sy | ||
49 | ret | ||
50 | ENDPROC(__cpu_flush_user_tlb_range) | ||
51 | |||
52 | /* | ||
53 | * __cpu_flush_kern_tlb_range(start,end) | ||
54 | * | ||
55 | * Invalidate a range of kernel TLB entries. | ||
56 | * | ||
57 | * - start - start address (may not be aligned) | ||
58 | * - end - end address (exclusive, may not be aligned) | ||
59 | */ | ||
60 | ENTRY(__cpu_flush_kern_tlb_range) | ||
61 | dsb sy | ||
62 | lsr x0, x0, #12 // align address | ||
63 | lsr x1, x1, #12 | ||
64 | 1: tlbi vaae1is, x0 // TLB invalidate by address | ||
65 | add x0, x0, #1 | ||
66 | cmp x0, x1 | ||
67 | b.lo 1b | ||
68 | dsb sy | ||
69 | isb | ||
70 | ret | ||
71 | ENDPROC(__cpu_flush_kern_tlb_range) | ||
diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h index 8a029505d7b7..2f1c3c2657ad 100644 --- a/arch/blackfin/include/asm/ftrace.h +++ b/arch/blackfin/include/asm/ftrace.h | |||
@@ -66,16 +66,7 @@ extern inline void *return_address(unsigned int level) | |||
66 | 66 | ||
67 | #endif /* CONFIG_FRAME_POINTER */ | 67 | #endif /* CONFIG_FRAME_POINTER */ |
68 | 68 | ||
69 | #define HAVE_ARCH_CALLER_ADDR | 69 | #define ftrace_return_address(n) return_address(n) |
70 | |||
71 | /* inline function or macro may lead to unexpected result */ | ||
72 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
73 | #define CALLER_ADDR1 ((unsigned long)return_address(1)) | ||
74 | #define CALLER_ADDR2 ((unsigned long)return_address(2)) | ||
75 | #define CALLER_ADDR3 ((unsigned long)return_address(3)) | ||
76 | #define CALLER_ADDR4 ((unsigned long)return_address(4)) | ||
77 | #define CALLER_ADDR5 ((unsigned long)return_address(5)) | ||
78 | #define CALLER_ADDR6 ((unsigned long)return_address(6)) | ||
79 | 70 | ||
80 | #endif /* __ASSEMBLY__ */ | 71 | #endif /* __ASSEMBLY__ */ |
81 | 72 | ||
diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h index 72c0fafaa039..544ed8ef87eb 100644 --- a/arch/parisc/include/asm/ftrace.h +++ b/arch/parisc/include/asm/ftrace.h | |||
@@ -24,15 +24,7 @@ extern void return_to_handler(void); | |||
24 | 24 | ||
25 | extern unsigned long return_address(unsigned int); | 25 | extern unsigned long return_address(unsigned int); |
26 | 26 | ||
27 | #define HAVE_ARCH_CALLER_ADDR | 27 | #define ftrace_return_address(n) return_address(n) |
28 | |||
29 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
30 | #define CALLER_ADDR1 return_address(1) | ||
31 | #define CALLER_ADDR2 return_address(2) | ||
32 | #define CALLER_ADDR3 return_address(3) | ||
33 | #define CALLER_ADDR4 return_address(4) | ||
34 | #define CALLER_ADDR5 return_address(5) | ||
35 | #define CALLER_ADDR6 return_address(6) | ||
36 | 28 | ||
37 | #endif /* __ASSEMBLY__ */ | 29 | #endif /* __ASSEMBLY__ */ |
38 | 30 | ||
diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 13e9966464c2..e79fb6ebaa42 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h | |||
@@ -40,15 +40,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) | |||
40 | /* arch/sh/kernel/return_address.c */ | 40 | /* arch/sh/kernel/return_address.c */ |
41 | extern void *return_address(unsigned int); | 41 | extern void *return_address(unsigned int); |
42 | 42 | ||
43 | #define HAVE_ARCH_CALLER_ADDR | 43 | #define ftrace_return_address(n) return_address(n) |
44 | |||
45 | #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | ||
46 | #define CALLER_ADDR1 ((unsigned long)return_address(1)) | ||
47 | #define CALLER_ADDR2 ((unsigned long)return_address(2)) | ||
48 | #define CALLER_ADDR3 ((unsigned long)return_address(3)) | ||
49 | #define CALLER_ADDR4 ((unsigned long)return_address(4)) | ||
50 | #define CALLER_ADDR5 ((unsigned long)return_address(5)) | ||
51 | #define CALLER_ADDR6 ((unsigned long)return_address(6)) | ||
52 | 44 | ||
53 | #endif /* __ASSEMBLY__ */ | 45 | #endif /* __ASSEMBLY__ */ |
54 | 46 | ||
diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h index 736b9d214d80..6c6d9a9f185f 100644 --- a/arch/xtensa/include/asm/ftrace.h +++ b/arch/xtensa/include/asm/ftrace.h | |||
@@ -12,24 +12,18 @@ | |||
12 | 12 | ||
13 | #include <asm/processor.h> | 13 | #include <asm/processor.h> |
14 | 14 | ||
15 | #define HAVE_ARCH_CALLER_ADDR | ||
16 | #ifndef __ASSEMBLY__ | 15 | #ifndef __ASSEMBLY__ |
17 | #define CALLER_ADDR0 ({ unsigned long a0, a1; \ | 16 | #define ftrace_return_address0 ({ unsigned long a0, a1; \ |
18 | __asm__ __volatile__ ( \ | 17 | __asm__ __volatile__ ( \ |
19 | "mov %0, a0\n" \ | 18 | "mov %0, a0\n" \ |
20 | "mov %1, a1\n" \ | 19 | "mov %1, a1\n" \ |
21 | : "=r"(a0), "=r"(a1)); \ | 20 | : "=r"(a0), "=r"(a1)); \ |
22 | MAKE_PC_FROM_RA(a0, a1); }) | 21 | MAKE_PC_FROM_RA(a0, a1); }) |
22 | |||
23 | #ifdef CONFIG_FRAME_POINTER | 23 | #ifdef CONFIG_FRAME_POINTER |
24 | extern unsigned long return_address(unsigned level); | 24 | extern unsigned long return_address(unsigned level); |
25 | #define CALLER_ADDR1 return_address(1) | 25 | #define ftrace_return_address(n) return_address(n) |
26 | #define CALLER_ADDR2 return_address(2) | 26 | #endif |
27 | #define CALLER_ADDR3 return_address(3) | ||
28 | #else /* CONFIG_FRAME_POINTER */ | ||
29 | #define CALLER_ADDR1 (0) | ||
30 | #define CALLER_ADDR2 (0) | ||
31 | #define CALLER_ADDR3 (0) | ||
32 | #endif /* CONFIG_FRAME_POINTER */ | ||
33 | #endif /* __ASSEMBLY__ */ | 27 | #endif /* __ASSEMBLY__ */ |
34 | 28 | ||
35 | #ifdef CONFIG_FUNCTION_TRACER | 29 | #ifdef CONFIG_FUNCTION_TRACER |
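xtensa is the one architecture here that still needs its own ftrace_return_address0: on the windowed ABI the caller's return address sits in a0 with the window increment in its top two bits, so the real PC has to be reassembled using the address region from a1. A hedged C equivalent, assuming the usual MAKE_PC_FROM_RA definition on this architecture:

	/*
	 * Assumption: MAKE_PC_FROM_RA(ra, sp) masks off the window bits of the
	 * return address and borrows the 512MB region from the stack pointer.
	 */
	static inline unsigned long xtensa_pc_from_ra(unsigned long a0, unsigned long a1)
	{
		return (a0 & 0x3fffffffUL) | (a1 & 0xc0000000UL);
	}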
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index 03cf5936bad6..1ac097279db1 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h | |||
@@ -4,22 +4,27 @@ | |||
4 | /* | 4 | /* |
5 | * This is the most generic implementation of unaligned accesses | 5 | * This is the most generic implementation of unaligned accesses |
6 | * and should work almost anywhere. | 6 | * and should work almost anywhere. |
7 | * | ||
8 | * If an architecture can handle unaligned accesses in hardware, | ||
9 | * it may want to use the linux/unaligned/access_ok.h implementation | ||
10 | * instead. | ||
11 | */ | 7 | */ |
12 | #include <asm/byteorder.h> | 8 | #include <asm/byteorder.h> |
13 | 9 | ||
10 | /* Set by the arch if it can handle unaligned accesses in hardware. */ | ||
11 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS | ||
12 | # include <linux/unaligned/access_ok.h> | ||
13 | #endif | ||
14 | |||
14 | #if defined(__LITTLE_ENDIAN) | 15 | #if defined(__LITTLE_ENDIAN) |
15 | # include <linux/unaligned/le_struct.h> | 16 | # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
16 | # include <linux/unaligned/be_byteshift.h> | 17 | # include <linux/unaligned/le_struct.h> |
18 | # include <linux/unaligned/be_byteshift.h> | ||
19 | # endif | ||
17 | # include <linux/unaligned/generic.h> | 20 | # include <linux/unaligned/generic.h> |
18 | # define get_unaligned __get_unaligned_le | 21 | # define get_unaligned __get_unaligned_le |
19 | # define put_unaligned __put_unaligned_le | 22 | # define put_unaligned __put_unaligned_le |
20 | #elif defined(__BIG_ENDIAN) | 23 | #elif defined(__BIG_ENDIAN) |
21 | # include <linux/unaligned/be_struct.h> | 24 | # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
22 | # include <linux/unaligned/le_byteshift.h> | 25 | # include <linux/unaligned/be_struct.h> |
26 | # include <linux/unaligned/le_byteshift.h> | ||
27 | # endif | ||
23 | # include <linux/unaligned/generic.h> | 28 | # include <linux/unaligned/generic.h> |
24 | # define get_unaligned __get_unaligned_be | 29 | # define get_unaligned __get_unaligned_be |
25 | # define put_unaligned __put_unaligned_be | 30 | # define put_unaligned __put_unaligned_be |
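With this change the generic header honours CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS (which arm64 selects elsewhere in this series): such architectures get the access_ok.h helpers that compile to plain loads and stores, while everyone else keeps the packed-struct/byteshift fallbacks. Call sites look the same either way; a minimal usage sketch, assuming normal kernel context and a hypothetical packet layout:

	#include <linux/types.h>
	#include <asm/unaligned.h>

	/*
	 * Read/write a 32-bit little-endian field starting at byte 1 of the
	 * buffer, i.e. at a deliberately misaligned offset.
	 */
	static u32 read_len_field(const u8 *pkt)
	{
		return get_unaligned_le32(pkt + 1);
	}

	static void write_len_field(u8 *pkt, u32 len)
	{
		put_unaligned_le32(len, pkt + 1);
	}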
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ae9504b4b67d..2018751cad9e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h | |||
@@ -616,25 +616,27 @@ static inline void __ftrace_enabled_restore(int enabled) | |||
616 | #endif | 616 | #endif |
617 | } | 617 | } |
618 | 618 | ||
619 | #ifndef HAVE_ARCH_CALLER_ADDR | 619 | /* All archs should have this, but we define it for consistency */ |
620 | #ifndef ftrace_return_address0 | ||
621 | # define ftrace_return_address0 __builtin_return_address(0) | ||
622 | #endif | ||
623 | |||
624 | /* Archs may use other ways for ADDR1 and beyond */ | ||
625 | #ifndef ftrace_return_address | ||
620 | # ifdef CONFIG_FRAME_POINTER | 626 | # ifdef CONFIG_FRAME_POINTER |
621 | # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | 627 | # define ftrace_return_address(n) __builtin_return_address(n) |
622 | # define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) | ||
623 | # define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) | ||
624 | # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) | ||
625 | # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) | ||
626 | # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) | ||
627 | # define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) | ||
628 | # else | 628 | # else |
629 | # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) | 629 | # define ftrace_return_address(n) 0UL |
630 | # define CALLER_ADDR1 0UL | ||
631 | # define CALLER_ADDR2 0UL | ||
632 | # define CALLER_ADDR3 0UL | ||
633 | # define CALLER_ADDR4 0UL | ||
634 | # define CALLER_ADDR5 0UL | ||
635 | # define CALLER_ADDR6 0UL | ||
636 | # endif | 630 | # endif |
637 | #endif /* ifndef HAVE_ARCH_CALLER_ADDR */ | 631 | #endif |
632 | |||
633 | #define CALLER_ADDR0 ((unsigned long)ftrace_return_address0) | ||
634 | #define CALLER_ADDR1 ((unsigned long)ftrace_return_address(1)) | ||
635 | #define CALLER_ADDR2 ((unsigned long)ftrace_return_address(2)) | ||
636 | #define CALLER_ADDR3 ((unsigned long)ftrace_return_address(3)) | ||
637 | #define CALLER_ADDR4 ((unsigned long)ftrace_return_address(4)) | ||
638 | #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) | ||
639 | #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) | ||
638 | 640 | ||
639 | #ifdef CONFIG_IRQSOFF_TRACER | 641 | #ifdef CONFIG_IRQSOFF_TRACER |
640 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); | 642 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); |
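After this consolidation an architecture only has to supply ftrace_return_address(n) (and, in the unusual xtensa case, ftrace_return_address0); CALLER_ADDR0..CALLER_ADDR6 are derived once here instead of being re-spelled by every arch header. Consumers are unaffected; a minimal sketch of typical use, with a hypothetical helper name:

	#include <linux/ftrace.h>
	#include <linux/printk.h>

	static void report_caller(void)
	{
		unsigned long ip     = CALLER_ADDR0;	/* __builtin_return_address(0) unless overridden */
		unsigned long parent = CALLER_ADDR1;	/* arch walker, frame pointers, or 0UL */

		pr_debug("called from %pS (parent %pS)\n", (void *)ip, (void *)parent);
	}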
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 9c22317778eb..e11aa4a156d2 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c | |||
@@ -40,6 +40,11 @@ | |||
40 | #define R_METAG_NONE 3 | 40 | #define R_METAG_NONE 3 |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | #ifndef EM_AARCH64 | ||
44 | #define EM_AARCH64 183 | ||
45 | #define R_AARCH64_ABS64 257 | ||
46 | #endif | ||
47 | |||
43 | static int fd_map; /* File descriptor for file being modified. */ | 48 | static int fd_map; /* File descriptor for file being modified. */ |
44 | static int mmap_failed; /* Boolean flag. */ | 49 | static int mmap_failed; /* Boolean flag. */ |
45 | static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */ | 50 | static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */ |
@@ -347,6 +352,8 @@ do_file(char const *const fname) | |||
347 | case EM_ARM: reltype = R_ARM_ABS32; | 352 | case EM_ARM: reltype = R_ARM_ABS32; |
348 | altmcount = "__gnu_mcount_nc"; | 353 | altmcount = "__gnu_mcount_nc"; |
349 | break; | 354 | break; |
355 | case EM_AARCH64: | ||
356 | reltype = R_AARCH64_ABS64; gpfx = '_'; break; | ||
350 | case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; | 357 | case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; |
351 | case EM_METAG: reltype = R_METAG_ADDR32; | 358 | case EM_METAG: reltype = R_METAG_ADDR32; |
352 | altmcount = "_mcount_wrapper"; | 359 | altmcount = "_mcount_wrapper"; |
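recordmcount.c is taught the AArch64 ELF machine number and the 64-bit absolute relocation to emit for each recorded _mcount call site, with fallback defines for older elf.h headers that lack them. The net effect, as with the Perl version below, is a __mcount_loc table of 8-byte call-site addresses; a hedged sketch of how such a table is consumed, using the linker-provided boundary symbols:

	extern unsigned long __start_mcount_loc[];	/* linker script boundaries */
	extern unsigned long __stop_mcount_loc[];

	/* Illustration only: iterate the recorded _mcount call sites. */
	static void walk_mcount_table(void (*fn)(unsigned long addr))
	{
		unsigned long *p;

		for (p = __start_mcount_loc; p < __stop_mcount_loc; p++)
			fn(*p);
	}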
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 91280b82da08..397b6b84e8c5 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl | |||
@@ -279,6 +279,11 @@ if ($arch eq "x86_64") { | |||
279 | $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_ARM_(CALL|PC24|THM_CALL)" . | 279 | $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_ARM_(CALL|PC24|THM_CALL)" . |
280 | "\\s+(__gnu_mcount_nc|mcount)\$"; | 280 | "\\s+(__gnu_mcount_nc|mcount)\$"; |
281 | 281 | ||
282 | } elsif ($arch eq "arm64") { | ||
283 | $alignment = 3; | ||
284 | $section_type = '%progbits'; | ||
285 | $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_AARCH64_CALL26\\s+_mcount\$"; | ||
286 | $type = ".quad"; | ||
282 | } elsif ($arch eq "ia64") { | 287 | } elsif ($arch eq "ia64") { |
283 | $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; | 288 | $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; |
284 | $type = "data8"; | 289 | $type = "data8"; |