-rw-r--r--  arch/arm/include/asm/ftrace.h | 10
-rw-r--r--  arch/arm64/Kconfig | 12
-rw-r--r--  arch/arm64/Makefile | 1
-rw-r--r--  arch/arm64/configs/defconfig | 36
-rw-r--r--  arch/arm64/crypto/Kconfig | 53
-rw-r--r--  arch/arm64/crypto/Makefile | 38
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-core.S | 222
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-glue.c | 297
-rw-r--r--  arch/arm64/crypto/aes-ce-cipher.c | 155
-rw-r--r--  arch/arm64/crypto/aes-ce.S | 133
-rw-r--r--  arch/arm64/crypto/aes-glue.c | 446
-rw-r--r--  arch/arm64/crypto/aes-modes.S | 532
-rw-r--r--  arch/arm64/crypto/aes-neon.S | 382
-rw-r--r--  arch/arm64/crypto/ghash-ce-core.S | 95
-rw-r--r--  arch/arm64/crypto/ghash-ce-glue.c | 155
-rw-r--r--  arch/arm64/crypto/sha1-ce-core.S | 153
-rw-r--r--  arch/arm64/crypto/sha1-ce-glue.c | 174
-rw-r--r--  arch/arm64/crypto/sha2-ce-core.S | 156
-rw-r--r--  arch/arm64/crypto/sha2-ce-glue.c | 255
-rw-r--r--  arch/arm64/include/asm/Kbuild | 1
-rw-r--r--  arch/arm64/include/asm/assembler.h | 23
-rw-r--r--  arch/arm64/include/asm/atomic.h | 2
-rw-r--r--  arch/arm64/include/asm/barrier.h | 20
-rw-r--r--  arch/arm64/include/asm/cache.h | 13
-rw-r--r--  arch/arm64/include/asm/cacheflush.h | 4
-rw-r--r--  arch/arm64/include/asm/cachetype.h | 11
-rw-r--r--  arch/arm64/include/asm/cmpxchg.h | 7
-rw-r--r--  arch/arm64/include/asm/compat.h | 5
-rw-r--r--  arch/arm64/include/asm/esr.h | 6
-rw-r--r--  arch/arm64/include/asm/fpsimd.h | 23
-rw-r--r--  arch/arm64/include/asm/fpsimdmacros.h | 35
-rw-r--r--  arch/arm64/include/asm/ftrace.h | 59
-rw-r--r--  arch/arm64/include/asm/hardirq.h | 2
-rw-r--r--  arch/arm64/include/asm/insn.h | 2
-rw-r--r--  arch/arm64/include/asm/io.h | 8
-rw-r--r--  arch/arm64/include/asm/neon.h | 6
-rw-r--r--  arch/arm64/include/asm/pgtable-hwdef.h | 2
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 107
-rw-r--r--  arch/arm64/include/asm/processor.h | 1
-rw-r--r--  arch/arm64/include/asm/ptrace.h | 5
-rw-r--r--  arch/arm64/include/asm/sigcontext.h | 31
-rw-r--r--  arch/arm64/include/asm/string.h | 15
-rw-r--r--  arch/arm64/include/asm/syscall.h | 1
-rw-r--r--  arch/arm64/include/asm/thread_info.h | 17
-rw-r--r--  arch/arm64/include/asm/tlbflush.h | 44
-rw-r--r--  arch/arm64/include/asm/topology.h | 3
-rw-r--r--  arch/arm64/include/asm/unistd.h | 2
-rw-r--r--  arch/arm64/include/uapi/asm/sigcontext.h | 7
-rw-r--r--  arch/arm64/kernel/Makefile | 7
-rw-r--r--  arch/arm64/kernel/arm64ksyms.c | 9
-rw-r--r--  arch/arm64/kernel/entry-fpsimd.S | 24
-rw-r--r--  arch/arm64/kernel/entry-ftrace.S | 218
-rw-r--r--  arch/arm64/kernel/entry.S | 90
-rw-r--r--  arch/arm64/kernel/fpsimd.c | 186
-rw-r--r--  arch/arm64/kernel/ftrace.c | 176
-rw-r--r--  arch/arm64/kernel/head.S | 8
-rw-r--r--  arch/arm64/kernel/hw_breakpoint.c | 2
-rw-r--r--  arch/arm64/kernel/process.c | 49
-rw-r--r--  arch/arm64/kernel/ptrace.c | 62
-rw-r--r--  arch/arm64/kernel/return_address.c | 55
-rw-r--r--  arch/arm64/kernel/setup.c | 16
-rw-r--r--  arch/arm64/kernel/signal.c | 52
-rw-r--r--  arch/arm64/kernel/signal32.c | 16
-rw-r--r--  arch/arm64/kernel/smp.c | 19
-rw-r--r--  arch/arm64/kernel/smp_spin_table.c | 39
-rw-r--r--  arch/arm64/kernel/stacktrace.c | 2
-rw-r--r--  arch/arm64/kernel/time.c | 3
-rw-r--r--  arch/arm64/kernel/topology.c | 212
-rw-r--r--  arch/arm64/kernel/traps.c | 7
-rw-r--r--  arch/arm64/kernel/vmlinux.lds.S | 2
-rw-r--r--  arch/arm64/kvm/hyp.S | 12
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 4
-rw-r--r--  arch/arm64/lib/Makefile | 1
-rw-r--r--  arch/arm64/lib/memcmp.S | 258
-rw-r--r--  arch/arm64/lib/memcpy.S | 192
-rw-r--r--  arch/arm64/lib/memmove.S | 190
-rw-r--r--  arch/arm64/lib/memset.S | 207
-rw-r--r--  arch/arm64/lib/strcmp.S | 234
-rw-r--r--  arch/arm64/lib/strlen.S | 126
-rw-r--r--  arch/arm64/lib/strncmp.S | 310
-rw-r--r--  arch/arm64/lib/strnlen.S | 171
-rw-r--r--  arch/arm64/mm/Makefile | 2
-rw-r--r--  arch/arm64/mm/cache.S | 6
-rw-r--r--  arch/arm64/mm/dma-mapping.c | 2
-rw-r--r--  arch/arm64/mm/fault.c | 8
-rw-r--r--  arch/arm64/mm/mmu.c | 67
-rw-r--r--  arch/arm64/mm/proc.S | 2
-rw-r--r--  arch/arm64/mm/tlb.S | 71
-rw-r--r--  arch/blackfin/include/asm/ftrace.h | 11
-rw-r--r--  arch/parisc/include/asm/ftrace.h | 10
-rw-r--r--  arch/sh/include/asm/ftrace.h | 10
-rw-r--r--  arch/xtensa/include/asm/ftrace.h | 14
-rw-r--r--  include/asm-generic/unaligned.h | 21
-rw-r--r--  include/linux/ftrace.h | 34
-rw-r--r--  scripts/recordmcount.c | 7
-rwxr-xr-x  scripts/recordmcount.pl | 5
96 files changed, 6391 insertions(+), 605 deletions(-)
diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index f89515adac60..eb577f4f5f70 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -52,15 +52,7 @@ extern inline void *return_address(unsigned int level)
 
 #endif
 
-#define HAVE_ARCH_CALLER_ADDR
-
-#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#define CALLER_ADDR1 ((unsigned long)return_address(1))
-#define CALLER_ADDR2 ((unsigned long)return_address(2))
-#define CALLER_ADDR3 ((unsigned long)return_address(3))
-#define CALLER_ADDR4 ((unsigned long)return_address(4))
-#define CALLER_ADDR5 ((unsigned long)return_address(5))
-#define CALLER_ADDR6 ((unsigned long)return_address(6))
+#define ftrace_return_addr(n) return_address(n)
 
 #endif /* ifndef __ASSEMBLY__ */
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e384ab9b3862..7295419165e1 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -30,12 +30,17 @@ config ARM64
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_CONTIGUOUS
+	select HAVE_DYNAMIC_FTRACE
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS
 	select HAVE_MEMBLOCK
@@ -43,6 +48,7 @@ config ARM64
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_SYSCALL_TRACEPOINTS
 	select IRQ_DOMAIN
 	select MODULES_USE_ELF_RELA
 	select NO_BOOTMEM
@@ -245,6 +251,9 @@ config ARCH_WANT_HUGE_PMD_SHARE
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	def_bool y
 
+config ARCH_HAS_CACHE_LINE_SIZE
+	def_bool y
+
 source "mm/Kconfig"
 
 config XEN_DOM0
@@ -359,5 +368,8 @@ source "arch/arm64/Kconfig.debug"
 source "security/Kconfig"
 
 source "crypto/Kconfig"
+if CRYPTO
+source "arch/arm64/crypto/Kconfig"
+endif
 
 source "lib/Kconfig"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 2fceb71ac3b7..8185a913c5ed 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS
 core-y		+= arch/arm64/kernel/ arch/arm64/mm/
 core-$(CONFIG_KVM) += arch/arm64/kvm/
 core-$(CONFIG_XEN) += arch/arm64/xen/
+core-$(CONFIG_CRYPTO) += arch/arm64/crypto/
 libs-y		:= arch/arm64/lib/ $(libs-y)
 libs-y		+= $(LIBGCC)
 
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 7959dd0ca5d5..157e1d8d9a47 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1,11 +1,11 @@
 # CONFIG_LOCALVERSION_AUTO is not set
-# CONFIG_SWAP is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
@@ -27,6 +27,7 @@ CONFIG_ARCH_VEXPRESS=y
 CONFIG_ARCH_XGENE=y
 CONFIG_SMP=y
 CONFIG_PREEMPT=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
 CONFIG_CMA=y
 CONFIG_CMDLINE="console=ttyAMA0"
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
@@ -44,7 +45,7 @@ CONFIG_IP_PNP_BOOTP=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
 CONFIG_DMA_CMA=y
-CONFIG_SCSI=y
+CONFIG_VIRTIO_BLK=y
 # CONFIG_SCSI_PROC_FS is not set
 CONFIG_BLK_DEV_SD=y
 # CONFIG_SCSI_LOWLEVEL is not set
@@ -56,20 +57,18 @@ CONFIG_SMC91X=y
 CONFIG_SMSC911X=y
 # CONFIG_WLAN is not set
 CONFIG_INPUT_EVDEV=y
-# CONFIG_SERIO_I8042 is not set
 # CONFIG_SERIO_SERPORT is not set
 CONFIG_LEGACY_PTY_COUNT=16
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_SERIAL_AMBA_PL011=y
 CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_HW_RANDOM is not set
 # CONFIG_HWMON is not set
 CONFIG_REGULATOR=y
 CONFIG_REGULATOR_FIXED_VOLTAGE=y
 CONFIG_FB=y
-# CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
@@ -79,27 +78,38 @@ CONFIG_USB_ISP1760_HCD=y
 CONFIG_USB_STORAGE=y
 CONFIG_MMC=y
 CONFIG_MMC_ARMMMCI=y
+CONFIG_VIRTIO_MMIO=y
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
-CONFIG_EXT4_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
+CONFIG_EXT4_FS=y
 CONFIG_FUSE_FS=y
 CONFIG_CUSE=y
 CONFIG_VFAT_FS=y
 CONFIG_TMPFS=y
+CONFIG_HUGETLBFS=y
 # CONFIG_MISC_FILESYSTEMS is not set
 CONFIG_NFS_FS=y
 CONFIG_ROOT_NFS=y
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
-CONFIG_MAGIC_SYSRQ=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_KVM=y
+CONFIG_DEBUG_INFO=y
 CONFIG_DEBUG_FS=y
+CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
+CONFIG_LOCKUP_DETECTOR=y
 # CONFIG_SCHED_DEBUG is not set
-CONFIG_DEBUG_INFO=y
 # CONFIG_FTRACE is not set
-CONFIG_ATOMIC64_SELFTEST=y
-CONFIG_VIRTIO_MMIO=y
-CONFIG_VIRTIO_BLK=y
+CONFIG_CRYPTO_ANSI_CPRNG=y
+CONFIG_ARM64_CRYPTO=y
+CONFIG_CRYPTO_SHA1_ARM64_CE=y
+CONFIG_CRYPTO_SHA2_ARM64_CE=y
+CONFIG_CRYPTO_GHASH_ARM64_CE=y
+CONFIG_CRYPTO_AES_ARM64_CE=y
+CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
+CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
+CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
new file mode 100644
index 000000000000..5562652c5316
--- /dev/null
+++ b/arch/arm64/crypto/Kconfig
@@ -0,0 +1,53 @@
1
2menuconfig ARM64_CRYPTO
3 bool "ARM64 Accelerated Cryptographic Algorithms"
4 depends on ARM64
5 help
6 Say Y here to choose from a selection of cryptographic algorithms
7 implemented using ARM64 specific CPU features or instructions.
8
9if ARM64_CRYPTO
10
11config CRYPTO_SHA1_ARM64_CE
12 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
13 depends on ARM64 && KERNEL_MODE_NEON
14 select CRYPTO_HASH
15
16config CRYPTO_SHA2_ARM64_CE
17 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
18 depends on ARM64 && KERNEL_MODE_NEON
19 select CRYPTO_HASH
20
21config CRYPTO_GHASH_ARM64_CE
22 tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
23 depends on ARM64 && KERNEL_MODE_NEON
24 select CRYPTO_HASH
25
26config CRYPTO_AES_ARM64_CE
27 tristate "AES core cipher using ARMv8 Crypto Extensions"
28 depends on ARM64 && KERNEL_MODE_NEON
29 select CRYPTO_ALGAPI
30 select CRYPTO_AES
31
32config CRYPTO_AES_ARM64_CE_CCM
33 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
34 depends on ARM64 && KERNEL_MODE_NEON
35 select CRYPTO_ALGAPI
36 select CRYPTO_AES
37 select CRYPTO_AEAD
38
39config CRYPTO_AES_ARM64_CE_BLK
40 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
41 depends on ARM64 && KERNEL_MODE_NEON
42 select CRYPTO_BLKCIPHER
43 select CRYPTO_AES
44 select CRYPTO_ABLK_HELPER
45
46config CRYPTO_AES_ARM64_NEON_BLK
47 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
48 depends on ARM64 && KERNEL_MODE_NEON
49 select CRYPTO_BLKCIPHER
50 select CRYPTO_AES
51 select CRYPTO_ABLK_HELPER
52
53endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
new file mode 100644
index 000000000000..2070a56ecc46
--- /dev/null
+++ b/arch/arm64/crypto/Makefile
@@ -0,0 +1,38 @@
1#
2# linux/arch/arm64/crypto/Makefile
3#
4# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License version 2 as
8# published by the Free Software Foundation.
9#
10
11obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
12sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
13
14obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
15sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
16
17obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
18ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
19
20obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
21CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
22
23obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
24aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
25
26obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
27aes-ce-blk-y := aes-glue-ce.o aes-ce.o
28
29obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
30aes-neon-blk-y := aes-glue-neon.o aes-neon.o
31
32AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE
33AFLAGS_aes-neon.o := -DINTERLEAVE=4
34
35CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
36
37$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
38 $(call if_changed_dep,cc_o_c)
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
new file mode 100644
index 000000000000..432e4841cd81
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -0,0 +1,222 @@
1/*
2 * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12
13 .text
14 .arch armv8-a+crypto
15
16 /*
17 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
18 * u32 *macp, u8 const rk[], u32 rounds);
19 */
20ENTRY(ce_aes_ccm_auth_data)
21 ldr w8, [x3] /* leftover from prev round? */
22 ld1 {v0.2d}, [x0] /* load mac */
23 cbz w8, 1f
24 sub w8, w8, #16
25 eor v1.16b, v1.16b, v1.16b
260: ldrb w7, [x1], #1 /* get 1 byte of input */
27 subs w2, w2, #1
28 add w8, w8, #1
29 ins v1.b[0], w7
30 ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
31 beq 8f /* out of input? */
32 cbnz w8, 0b
33 eor v0.16b, v0.16b, v1.16b
341: ld1 {v3.2d}, [x4] /* load first round key */
35 prfm pldl1strm, [x1]
36 cmp w5, #12 /* which key size? */
37 add x6, x4, #16
38 sub w7, w5, #2 /* modified # of rounds */
39 bmi 2f
40 bne 5f
41 mov v5.16b, v3.16b
42 b 4f
432: mov v4.16b, v3.16b
44 ld1 {v5.2d}, [x6], #16 /* load 2nd round key */
453: aese v0.16b, v4.16b
46 aesmc v0.16b, v0.16b
474: ld1 {v3.2d}, [x6], #16 /* load next round key */
48 aese v0.16b, v5.16b
49 aesmc v0.16b, v0.16b
505: ld1 {v4.2d}, [x6], #16 /* load next round key */
51 subs w7, w7, #3
52 aese v0.16b, v3.16b
53 aesmc v0.16b, v0.16b
54 ld1 {v5.2d}, [x6], #16 /* load next round key */
55 bpl 3b
56 aese v0.16b, v4.16b
57 subs w2, w2, #16 /* last data? */
58 eor v0.16b, v0.16b, v5.16b /* final round */
59 bmi 6f
60 ld1 {v1.16b}, [x1], #16 /* load next input block */
61 eor v0.16b, v0.16b, v1.16b /* xor with mac */
62 bne 1b
636: st1 {v0.2d}, [x0] /* store mac */
64 beq 10f
65 adds w2, w2, #16
66 beq 10f
67 mov w8, w2
687: ldrb w7, [x1], #1
69 umov w6, v0.b[0]
70 eor w6, w6, w7
71 strb w6, [x0], #1
72 subs w2, w2, #1
73 beq 10f
74 ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
75 b 7b
768: mov w7, w8
77 add w8, w8, #16
789: ext v1.16b, v1.16b, v1.16b, #1
79 adds w7, w7, #1
80 bne 9b
81 eor v0.16b, v0.16b, v1.16b
82 st1 {v0.2d}, [x0]
8310: str w8, [x3]
84 ret
85ENDPROC(ce_aes_ccm_auth_data)
86
87 /*
88 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
89 * u32 rounds);
90 */
91ENTRY(ce_aes_ccm_final)
92 ld1 {v3.2d}, [x2], #16 /* load first round key */
93 ld1 {v0.2d}, [x0] /* load mac */
94 cmp w3, #12 /* which key size? */
95 sub w3, w3, #2 /* modified # of rounds */
96 ld1 {v1.2d}, [x1] /* load 1st ctriv */
97 bmi 0f
98 bne 3f
99 mov v5.16b, v3.16b
100 b 2f
1010: mov v4.16b, v3.16b
1021: ld1 {v5.2d}, [x2], #16 /* load next round key */
103 aese v0.16b, v4.16b
104 aese v1.16b, v4.16b
105 aesmc v0.16b, v0.16b
106 aesmc v1.16b, v1.16b
1072: ld1 {v3.2d}, [x2], #16 /* load next round key */
108 aese v0.16b, v5.16b
109 aese v1.16b, v5.16b
110 aesmc v0.16b, v0.16b
111 aesmc v1.16b, v1.16b
1123: ld1 {v4.2d}, [x2], #16 /* load next round key */
113 subs w3, w3, #3
114 aese v0.16b, v3.16b
115 aese v1.16b, v3.16b
116 aesmc v0.16b, v0.16b
117 aesmc v1.16b, v1.16b
118 bpl 1b
119 aese v0.16b, v4.16b
120 aese v1.16b, v4.16b
121 /* final round key cancels out */
122 eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
123 st1 {v0.2d}, [x0] /* store result */
124 ret
125ENDPROC(ce_aes_ccm_final)
126
127 .macro aes_ccm_do_crypt,enc
128 ldr x8, [x6, #8] /* load lower ctr */
129 ld1 {v0.2d}, [x5] /* load mac */
130 rev x8, x8 /* keep swabbed ctr in reg */
1310: /* outer loop */
132 ld1 {v1.1d}, [x6] /* load upper ctr */
133 prfm pldl1strm, [x1]
134 add x8, x8, #1
135 rev x9, x8
136 cmp w4, #12 /* which key size? */
137 sub w7, w4, #2 /* get modified # of rounds */
138 ins v1.d[1], x9 /* no carry in lower ctr */
139 ld1 {v3.2d}, [x3] /* load first round key */
140 add x10, x3, #16
141 bmi 1f
142 bne 4f
143 mov v5.16b, v3.16b
144 b 3f
1451: mov v4.16b, v3.16b
146 ld1 {v5.2d}, [x10], #16 /* load 2nd round key */
1472: /* inner loop: 3 rounds, 2x interleaved */
148 aese v0.16b, v4.16b
149 aese v1.16b, v4.16b
150 aesmc v0.16b, v0.16b
151 aesmc v1.16b, v1.16b
1523: ld1 {v3.2d}, [x10], #16 /* load next round key */
153 aese v0.16b, v5.16b
154 aese v1.16b, v5.16b
155 aesmc v0.16b, v0.16b
156 aesmc v1.16b, v1.16b
1574: ld1 {v4.2d}, [x10], #16 /* load next round key */
158 subs w7, w7, #3
159 aese v0.16b, v3.16b
160 aese v1.16b, v3.16b
161 aesmc v0.16b, v0.16b
162 aesmc v1.16b, v1.16b
163 ld1 {v5.2d}, [x10], #16 /* load next round key */
164 bpl 2b
165 aese v0.16b, v4.16b
166 aese v1.16b, v4.16b
167 subs w2, w2, #16
168 bmi 6f /* partial block? */
169 ld1 {v2.16b}, [x1], #16 /* load next input block */
170 .if \enc == 1
171 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */
172 eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */
173 .else
174 eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */
175 eor v1.16b, v2.16b, v5.16b /* final round enc */
176 .endif
177 eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */
178 st1 {v1.16b}, [x0], #16 /* write output block */
179 bne 0b
180 rev x8, x8
181 st1 {v0.2d}, [x5] /* store mac */
182 str x8, [x6, #8] /* store lsb end of ctr (BE) */
1835: ret
184
1856: eor v0.16b, v0.16b, v5.16b /* final round mac */
186 eor v1.16b, v1.16b, v5.16b /* final round enc */
187 st1 {v0.2d}, [x5] /* store mac */
188 add w2, w2, #16 /* process partial tail block */
1897: ldrb w9, [x1], #1 /* get 1 byte of input */
190 umov w6, v1.b[0] /* get top crypted ctr byte */
191 umov w7, v0.b[0] /* get top mac byte */
192 .if \enc == 1
193 eor w7, w7, w9
194 eor w9, w9, w6
195 .else
196 eor w9, w9, w6
197 eor w7, w7, w9
198 .endif
199 strb w9, [x0], #1 /* store out byte */
200 strb w7, [x5], #1 /* store mac byte */
201 subs w2, w2, #1
202 beq 5b
203 ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */
204 ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */
205 b 7b
206 .endm
207
208 /*
209 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
210 * u8 const rk[], u32 rounds, u8 mac[],
211 * u8 ctr[]);
212 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
213 * u8 const rk[], u32 rounds, u8 mac[],
214 * u8 ctr[]);
215 */
216ENTRY(ce_aes_ccm_encrypt)
217 aes_ccm_do_crypt 1
218ENDPROC(ce_aes_ccm_encrypt)
219
220ENTRY(ce_aes_ccm_decrypt)
221 aes_ccm_do_crypt 0
222ENDPROC(ce_aes_ccm_decrypt)
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
new file mode 100644
index 000000000000..9e6cdde9b43d
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -0,0 +1,297 @@
1/*
2 * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/aes.h>
14#include <crypto/algapi.h>
15#include <crypto/scatterwalk.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19static int num_rounds(struct crypto_aes_ctx *ctx)
20{
21 /*
22 * # of rounds specified by AES:
23 * 128 bit key 10 rounds
24 * 192 bit key 12 rounds
25 * 256 bit key 14 rounds
26 * => n byte key => 6 + (n/4) rounds
27 */
28 return 6 + ctx->key_length / 4;
29}
30
31asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
32 u32 *macp, u32 const rk[], u32 rounds);
33
34asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
35 u32 const rk[], u32 rounds, u8 mac[],
36 u8 ctr[]);
37
38asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
39 u32 const rk[], u32 rounds, u8 mac[],
40 u8 ctr[]);
41
42asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
43 u32 rounds);
44
45static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
46 unsigned int key_len)
47{
48 struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm);
49 int ret;
50
51 ret = crypto_aes_expand_key(ctx, in_key, key_len);
52 if (!ret)
53 return 0;
54
55 tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
56 return -EINVAL;
57}
58
59static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
60{
61 if ((authsize & 1) || authsize < 4)
62 return -EINVAL;
63 return 0;
64}
65
66static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
67{
68 struct crypto_aead *aead = crypto_aead_reqtfm(req);
69 __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8];
70 u32 l = req->iv[0] + 1;
71
72 /* verify that CCM dimension 'L' is set correctly in the IV */
73 if (l < 2 || l > 8)
74 return -EINVAL;
75
76 /* verify that msglen can in fact be represented in L bytes */
77 if (l < 4 && msglen >> (8 * l))
78 return -EOVERFLOW;
79
80 /*
81 * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi
82 * uses a u32 type to represent msglen so the top 4 bytes are always 0.
83 */
84 n[0] = 0;
85 n[1] = cpu_to_be32(msglen);
86
87 memcpy(maciv, req->iv, AES_BLOCK_SIZE - l);
88
89 /*
90 * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C)
91 * - bits 0..2 : max # of bytes required to represent msglen, minus 1
92 * (already set by caller)
93 * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc)
94 * - bit 6 : indicates presence of authenticate-only data
95 */
96 maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2;
97 if (req->assoclen)
98 maciv[0] |= 0x40;
99
100 memset(&req->iv[AES_BLOCK_SIZE - l], 0, l);
101 return 0;
102}
103
104static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
105{
106 struct crypto_aead *aead = crypto_aead_reqtfm(req);
107 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
108 struct __packed { __be16 l; __be32 h; u16 len; } ltag;
109 struct scatter_walk walk;
110 u32 len = req->assoclen;
111 u32 macp = 0;
112
113 /* prepend the AAD with a length tag */
114 if (len < 0xff00) {
115 ltag.l = cpu_to_be16(len);
116 ltag.len = 2;
117 } else {
118 ltag.l = cpu_to_be16(0xfffe);
119 put_unaligned_be32(len, &ltag.h);
120 ltag.len = 6;
121 }
122
123 ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc,
124 num_rounds(ctx));
125 scatterwalk_start(&walk, req->assoc);
126
127 do {
128 u32 n = scatterwalk_clamp(&walk, len);
129 u8 *p;
130
131 if (!n) {
132 scatterwalk_start(&walk, sg_next(walk.sg));
133 n = scatterwalk_clamp(&walk, len);
134 }
135 p = scatterwalk_map(&walk);
136 ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc,
137 num_rounds(ctx));
138 len -= n;
139
140 scatterwalk_unmap(p);
141 scatterwalk_advance(&walk, n);
142 scatterwalk_done(&walk, 0, len);
143 } while (len);
144}
145
146static int ccm_encrypt(struct aead_request *req)
147{
148 struct crypto_aead *aead = crypto_aead_reqtfm(req);
149 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
150 struct blkcipher_desc desc = { .info = req->iv };
151 struct blkcipher_walk walk;
152 u8 __aligned(8) mac[AES_BLOCK_SIZE];
153 u8 buf[AES_BLOCK_SIZE];
154 u32 len = req->cryptlen;
155 int err;
156
157 err = ccm_init_mac(req, mac, len);
158 if (err)
159 return err;
160
161 kernel_neon_begin_partial(6);
162
163 if (req->assoclen)
164 ccm_calculate_auth_mac(req, mac);
165
166 /* preserve the original iv for the final round */
167 memcpy(buf, req->iv, AES_BLOCK_SIZE);
168
169 blkcipher_walk_init(&walk, req->dst, req->src, len);
170 err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
171 AES_BLOCK_SIZE);
172
173 while (walk.nbytes) {
174 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
175
176 if (walk.nbytes == len)
177 tail = 0;
178
179 ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
180 walk.nbytes - tail, ctx->key_enc,
181 num_rounds(ctx), mac, walk.iv);
182
183 len -= walk.nbytes - tail;
184 err = blkcipher_walk_done(&desc, &walk, tail);
185 }
186 if (!err)
187 ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
188
189 kernel_neon_end();
190
191 if (err)
192 return err;
193
194 /* copy authtag to end of dst */
195 scatterwalk_map_and_copy(mac, req->dst, req->cryptlen,
196 crypto_aead_authsize(aead), 1);
197
198 return 0;
199}
200
201static int ccm_decrypt(struct aead_request *req)
202{
203 struct crypto_aead *aead = crypto_aead_reqtfm(req);
204 struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
205 unsigned int authsize = crypto_aead_authsize(aead);
206 struct blkcipher_desc desc = { .info = req->iv };
207 struct blkcipher_walk walk;
208 u8 __aligned(8) mac[AES_BLOCK_SIZE];
209 u8 buf[AES_BLOCK_SIZE];
210 u32 len = req->cryptlen - authsize;
211 int err;
212
213 err = ccm_init_mac(req, mac, len);
214 if (err)
215 return err;
216
217 kernel_neon_begin_partial(6);
218
219 if (req->assoclen)
220 ccm_calculate_auth_mac(req, mac);
221
222 /* preserve the original iv for the final round */
223 memcpy(buf, req->iv, AES_BLOCK_SIZE);
224
225 blkcipher_walk_init(&walk, req->dst, req->src, len);
226 err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
227 AES_BLOCK_SIZE);
228
229 while (walk.nbytes) {
230 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
231
232 if (walk.nbytes == len)
233 tail = 0;
234
235 ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
236 walk.nbytes - tail, ctx->key_enc,
237 num_rounds(ctx), mac, walk.iv);
238
239 len -= walk.nbytes - tail;
240 err = blkcipher_walk_done(&desc, &walk, tail);
241 }
242 if (!err)
243 ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
244
245 kernel_neon_end();
246
247 if (err)
248 return err;
249
250 /* compare calculated auth tag with the stored one */
251 scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize,
252 authsize, 0);
253
254 if (memcmp(mac, buf, authsize))
255 return -EBADMSG;
256 return 0;
257}
258
259static struct crypto_alg ccm_aes_alg = {
260 .cra_name = "ccm(aes)",
261 .cra_driver_name = "ccm-aes-ce",
262 .cra_priority = 300,
263 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
264 .cra_blocksize = 1,
265 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
266 .cra_alignmask = 7,
267 .cra_type = &crypto_aead_type,
268 .cra_module = THIS_MODULE,
269 .cra_aead = {
270 .ivsize = AES_BLOCK_SIZE,
271 .maxauthsize = AES_BLOCK_SIZE,
272 .setkey = ccm_setkey,
273 .setauthsize = ccm_setauthsize,
274 .encrypt = ccm_encrypt,
275 .decrypt = ccm_decrypt,
276 }
277};
278
279static int __init aes_mod_init(void)
280{
281 if (!(elf_hwcap & HWCAP_AES))
282 return -ENODEV;
283 return crypto_register_alg(&ccm_aes_alg);
284}
285
286static void __exit aes_mod_exit(void)
287{
288 crypto_unregister_alg(&ccm_aes_alg);
289}
290
291module_init(aes_mod_init);
292module_exit(aes_mod_exit);
293
294MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions");
295MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
296MODULE_LICENSE("GPL v2");
297MODULE_ALIAS("ccm(aes)");
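
A minimal standalone C check (not part of the patch, my own sketch) of the round-count rule documented in num_rounds() above: an n-byte key needs 6 + n/4 rounds, which reproduces the 10/12/14 rounds AES specifies for 128/192/256-bit keys.

/*
 * Standalone sanity check for the round-count rule used by num_rounds():
 * 6 + keylen/4 rounds, i.e. 10/12/14 for 16/24/32-byte keys.
 */
#include <assert.h>
#include <stdio.h>

static int num_rounds(unsigned int key_length)
{
	return 6 + key_length / 4;
}

int main(void)
{
	assert(num_rounds(16) == 10);	/* AES-128 */
	assert(num_rounds(24) == 12);	/* AES-192 */
	assert(num_rounds(32) == 14);	/* AES-256 */
	printf("6 + keylen/4 gives 10/12/14 rounds as expected\n");
	return 0;
}
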
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c
new file mode 100644
index 000000000000..2075e1acae6b
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-cipher.c
@@ -0,0 +1,155 @@
1/*
2 * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <crypto/aes.h>
13#include <linux/cpufeature.h>
14#include <linux/crypto.h>
15#include <linux/module.h>
16
17MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
18MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
19MODULE_LICENSE("GPL v2");
20
21struct aes_block {
22 u8 b[AES_BLOCK_SIZE];
23};
24
25static int num_rounds(struct crypto_aes_ctx *ctx)
26{
27 /*
28 * # of rounds specified by AES:
29 * 128 bit key 10 rounds
30 * 192 bit key 12 rounds
31 * 256 bit key 14 rounds
32 * => n byte key => 6 + (n/4) rounds
33 */
34 return 6 + ctx->key_length / 4;
35}
36
37static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
38{
39 struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
40 struct aes_block *out = (struct aes_block *)dst;
41 struct aes_block const *in = (struct aes_block *)src;
42 void *dummy0;
43 int dummy1;
44
45 kernel_neon_begin_partial(4);
46
47 __asm__(" ld1 {v0.16b}, %[in] ;"
48 " ld1 {v1.2d}, [%[key]], #16 ;"
49 " cmp %w[rounds], #10 ;"
50 " bmi 0f ;"
51 " bne 3f ;"
52 " mov v3.16b, v1.16b ;"
53 " b 2f ;"
54 "0: mov v2.16b, v1.16b ;"
55 " ld1 {v3.2d}, [%[key]], #16 ;"
56 "1: aese v0.16b, v2.16b ;"
57 " aesmc v0.16b, v0.16b ;"
58 "2: ld1 {v1.2d}, [%[key]], #16 ;"
59 " aese v0.16b, v3.16b ;"
60 " aesmc v0.16b, v0.16b ;"
61 "3: ld1 {v2.2d}, [%[key]], #16 ;"
62 " subs %w[rounds], %w[rounds], #3 ;"
63 " aese v0.16b, v1.16b ;"
64 " aesmc v0.16b, v0.16b ;"
65 " ld1 {v3.2d}, [%[key]], #16 ;"
66 " bpl 1b ;"
67 " aese v0.16b, v2.16b ;"
68 " eor v0.16b, v0.16b, v3.16b ;"
69 " st1 {v0.16b}, %[out] ;"
70
71 : [out] "=Q"(*out),
72 [key] "=r"(dummy0),
73 [rounds] "=r"(dummy1)
74 : [in] "Q"(*in),
75 "1"(ctx->key_enc),
76 "2"(num_rounds(ctx) - 2)
77 : "cc");
78
79 kernel_neon_end();
80}
81
82static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
83{
84 struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
85 struct aes_block *out = (struct aes_block *)dst;
86 struct aes_block const *in = (struct aes_block *)src;
87 void *dummy0;
88 int dummy1;
89
90 kernel_neon_begin_partial(4);
91
92 __asm__(" ld1 {v0.16b}, %[in] ;"
93 " ld1 {v1.2d}, [%[key]], #16 ;"
94 " cmp %w[rounds], #10 ;"
95 " bmi 0f ;"
96 " bne 3f ;"
97 " mov v3.16b, v1.16b ;"
98 " b 2f ;"
99 "0: mov v2.16b, v1.16b ;"
100 " ld1 {v3.2d}, [%[key]], #16 ;"
101 "1: aesd v0.16b, v2.16b ;"
102 " aesimc v0.16b, v0.16b ;"
103 "2: ld1 {v1.2d}, [%[key]], #16 ;"
104 " aesd v0.16b, v3.16b ;"
105 " aesimc v0.16b, v0.16b ;"
106 "3: ld1 {v2.2d}, [%[key]], #16 ;"
107 " subs %w[rounds], %w[rounds], #3 ;"
108 " aesd v0.16b, v1.16b ;"
109 " aesimc v0.16b, v0.16b ;"
110 " ld1 {v3.2d}, [%[key]], #16 ;"
111 " bpl 1b ;"
112 " aesd v0.16b, v2.16b ;"
113 " eor v0.16b, v0.16b, v3.16b ;"
114 " st1 {v0.16b}, %[out] ;"
115
116 : [out] "=Q"(*out),
117 [key] "=r"(dummy0),
118 [rounds] "=r"(dummy1)
119 : [in] "Q"(*in),
120 "1"(ctx->key_dec),
121 "2"(num_rounds(ctx) - 2)
122 : "cc");
123
124 kernel_neon_end();
125}
126
127static struct crypto_alg aes_alg = {
128 .cra_name = "aes",
129 .cra_driver_name = "aes-ce",
130 .cra_priority = 300,
131 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
132 .cra_blocksize = AES_BLOCK_SIZE,
133 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
134 .cra_module = THIS_MODULE,
135 .cra_cipher = {
136 .cia_min_keysize = AES_MIN_KEY_SIZE,
137 .cia_max_keysize = AES_MAX_KEY_SIZE,
138 .cia_setkey = crypto_aes_set_key,
139 .cia_encrypt = aes_cipher_encrypt,
140 .cia_decrypt = aes_cipher_decrypt
141 }
142};
143
144static int __init aes_mod_init(void)
145{
146 return crypto_register_alg(&aes_alg);
147}
148
149static void __exit aes_mod_exit(void)
150{
151 crypto_unregister_alg(&aes_alg);
152}
153
154module_cpu_feature_match(AES, aes_mod_init);
155module_exit(aes_mod_exit);
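
The cipher above registers under the generic name "aes" (driver name "aes-ce", priority 300), so kernel users that allocate a plain "aes" cipher pick it up automatically on CPUs that advertise the AES instructions. Below is a hypothetical kernel-side sketch, not part of the patch, using the standard single-block cipher API; encrypt_one_block is an illustrative helper name of my own.

/*
 * Hypothetical usage sketch: allocating "aes" selects the
 * highest-priority registered implementation (here "aes-ce").
 */
#include <crypto/aes.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/types.h>

static int encrypt_one_block(const u8 *key, unsigned int keylen,
			     const u8 in[AES_BLOCK_SIZE],
			     u8 out[AES_BLOCK_SIZE])
{
	struct crypto_cipher *tfm;
	int err;

	tfm = crypto_alloc_cipher("aes", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_cipher_setkey(tfm, key, keylen);
	if (!err)
		crypto_cipher_encrypt_one(tfm, out, in);

	crypto_free_cipher(tfm);
	return err;
}
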
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
new file mode 100644
index 000000000000..685a18f731eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,133 @@
1/*
2 * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
3 * Crypto Extensions
4 *
5 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/linkage.h>
13
14#define AES_ENTRY(func) ENTRY(ce_ ## func)
15#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
16
17 .arch armv8-a+crypto
18
19 /* preload all round keys */
20 .macro load_round_keys, rounds, rk
21 cmp \rounds, #12
22 blo 2222f /* 128 bits */
23 beq 1111f /* 192 bits */
24 ld1 {v17.16b-v18.16b}, [\rk], #32
251111: ld1 {v19.16b-v20.16b}, [\rk], #32
262222: ld1 {v21.16b-v24.16b}, [\rk], #64
27 ld1 {v25.16b-v28.16b}, [\rk], #64
28 ld1 {v29.16b-v31.16b}, [\rk]
29 .endm
30
31 /* prepare for encryption with key in rk[] */
32 .macro enc_prepare, rounds, rk, ignore
33 load_round_keys \rounds, \rk
34 .endm
35
36 /* prepare for encryption (again) but with new key in rk[] */
37 .macro enc_switch_key, rounds, rk, ignore
38 load_round_keys \rounds, \rk
39 .endm
40
41 /* prepare for decryption with key in rk[] */
42 .macro dec_prepare, rounds, rk, ignore
43 load_round_keys \rounds, \rk
44 .endm
45
46 .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
47 aes\de \i0\().16b, \k\().16b
48 .ifnb \i1
49 aes\de \i1\().16b, \k\().16b
50 .ifnb \i3
51 aes\de \i2\().16b, \k\().16b
52 aes\de \i3\().16b, \k\().16b
53 .endif
54 .endif
55 aes\mc \i0\().16b, \i0\().16b
56 .ifnb \i1
57 aes\mc \i1\().16b, \i1\().16b
58 .ifnb \i3
59 aes\mc \i2\().16b, \i2\().16b
60 aes\mc \i3\().16b, \i3\().16b
61 .endif
62 .endif
63 .endm
64
65 /* up to 4 interleaved encryption rounds with the same round key */
66 .macro round_Nx, enc, k, i0, i1, i2, i3
67 .ifc \enc, e
68 do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
69 .else
70 do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
71 .endif
72 .endm
73
74 /* up to 4 interleaved final rounds */
75 .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
76 aes\de \i0\().16b, \k\().16b
77 .ifnb \i1
78 aes\de \i1\().16b, \k\().16b
79 .ifnb \i3
80 aes\de \i2\().16b, \k\().16b
81 aes\de \i3\().16b, \k\().16b
82 .endif
83 .endif
84 eor \i0\().16b, \i0\().16b, \k2\().16b
85 .ifnb \i1
86 eor \i1\().16b, \i1\().16b, \k2\().16b
87 .ifnb \i3
88 eor \i2\().16b, \i2\().16b, \k2\().16b
89 eor \i3\().16b, \i3\().16b, \k2\().16b
90 .endif
91 .endif
92 .endm
93
94 /* up to 4 interleaved blocks */
95 .macro do_block_Nx, enc, rounds, i0, i1, i2, i3
96 cmp \rounds, #12
97 blo 2222f /* 128 bits */
98 beq 1111f /* 192 bits */
99 round_Nx \enc, v17, \i0, \i1, \i2, \i3
100 round_Nx \enc, v18, \i0, \i1, \i2, \i3
1011111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
102 round_Nx \enc, v20, \i0, \i1, \i2, \i3
1032222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
104 round_Nx \enc, \key, \i0, \i1, \i2, \i3
105 .endr
106 fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
107 .endm
108
109 .macro encrypt_block, in, rounds, t0, t1, t2
110 do_block_Nx e, \rounds, \in
111 .endm
112
113 .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
114 do_block_Nx e, \rounds, \i0, \i1
115 .endm
116
117 .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
118 do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
119 .endm
120
121 .macro decrypt_block, in, rounds, t0, t1, t2
122 do_block_Nx d, \rounds, \in
123 .endm
124
125 .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
126 do_block_Nx d, \rounds, \i0, \i1
127 .endm
128
129 .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
130 do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
131 .endm
132
133#include "aes-modes.S"
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
new file mode 100644
index 000000000000..60f2f4c12256
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@
1/*
2 * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/hwcap.h>
13#include <crypto/aes.h>
14#include <crypto/ablk_helper.h>
15#include <crypto/algapi.h>
16#include <linux/module.h>
17#include <linux/cpufeature.h>
18
19#ifdef USE_V8_CRYPTO_EXTENSIONS
20#define MODE "ce"
21#define PRIO 300
22#define aes_ecb_encrypt ce_aes_ecb_encrypt
23#define aes_ecb_decrypt ce_aes_ecb_decrypt
24#define aes_cbc_encrypt ce_aes_cbc_encrypt
25#define aes_cbc_decrypt ce_aes_cbc_decrypt
26#define aes_ctr_encrypt ce_aes_ctr_encrypt
27#define aes_xts_encrypt ce_aes_xts_encrypt
28#define aes_xts_decrypt ce_aes_xts_decrypt
29MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
30#else
31#define MODE "neon"
32#define PRIO 200
33#define aes_ecb_encrypt neon_aes_ecb_encrypt
34#define aes_ecb_decrypt neon_aes_ecb_decrypt
35#define aes_cbc_encrypt neon_aes_cbc_encrypt
36#define aes_cbc_decrypt neon_aes_cbc_decrypt
37#define aes_ctr_encrypt neon_aes_ctr_encrypt
38#define aes_xts_encrypt neon_aes_xts_encrypt
39#define aes_xts_decrypt neon_aes_xts_decrypt
40MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
41MODULE_ALIAS("ecb(aes)");
42MODULE_ALIAS("cbc(aes)");
43MODULE_ALIAS("ctr(aes)");
44MODULE_ALIAS("xts(aes)");
45#endif
46
47MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
48MODULE_LICENSE("GPL v2");
49
50/* defined in aes-modes.S */
51asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
52 int rounds, int blocks, int first);
53asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
54 int rounds, int blocks, int first);
55
56asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
57 int rounds, int blocks, u8 iv[], int first);
58asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
59 int rounds, int blocks, u8 iv[], int first);
60
61asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
62 int rounds, int blocks, u8 ctr[], int first);
63
64asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
65 int rounds, int blocks, u8 const rk2[], u8 iv[],
66 int first);
67asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
68 int rounds, int blocks, u8 const rk2[], u8 iv[],
69 int first);
70
71struct crypto_aes_xts_ctx {
72 struct crypto_aes_ctx key1;
73 struct crypto_aes_ctx __aligned(8) key2;
74};
75
76static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
77 unsigned int key_len)
78{
79 struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
80 int ret;
81
82 ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2);
83 if (!ret)
84 ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2],
85 key_len / 2);
86 if (!ret)
87 return 0;
88
89 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
90 return -EINVAL;
91}
92
93static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
94 struct scatterlist *src, unsigned int nbytes)
95{
96 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
97 int err, first, rounds = 6 + ctx->key_length / 4;
98 struct blkcipher_walk walk;
99 unsigned int blocks;
100
101 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
102 blkcipher_walk_init(&walk, dst, src, nbytes);
103 err = blkcipher_walk_virt(desc, &walk);
104
105 kernel_neon_begin();
106 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
107 aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
108 (u8 *)ctx->key_enc, rounds, blocks, first);
109 err = blkcipher_walk_done(desc, &walk, 0);
110 }
111 kernel_neon_end();
112 return err;
113}
114
115static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
116 struct scatterlist *src, unsigned int nbytes)
117{
118 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
119 int err, first, rounds = 6 + ctx->key_length / 4;
120 struct blkcipher_walk walk;
121 unsigned int blocks;
122
123 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
124 blkcipher_walk_init(&walk, dst, src, nbytes);
125 err = blkcipher_walk_virt(desc, &walk);
126
127 kernel_neon_begin();
128 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
129 aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
130 (u8 *)ctx->key_dec, rounds, blocks, first);
131 err = blkcipher_walk_done(desc, &walk, 0);
132 }
133 kernel_neon_end();
134 return err;
135}
136
137static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
138 struct scatterlist *src, unsigned int nbytes)
139{
140 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
141 int err, first, rounds = 6 + ctx->key_length / 4;
142 struct blkcipher_walk walk;
143 unsigned int blocks;
144
145 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
146 blkcipher_walk_init(&walk, dst, src, nbytes);
147 err = blkcipher_walk_virt(desc, &walk);
148
149 kernel_neon_begin();
150 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
151 aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
152 (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
153 first);
154 err = blkcipher_walk_done(desc, &walk, 0);
155 }
156 kernel_neon_end();
157 return err;
158}
159
160static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
161 struct scatterlist *src, unsigned int nbytes)
162{
163 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
164 int err, first, rounds = 6 + ctx->key_length / 4;
165 struct blkcipher_walk walk;
166 unsigned int blocks;
167
168 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
169 blkcipher_walk_init(&walk, dst, src, nbytes);
170 err = blkcipher_walk_virt(desc, &walk);
171
172 kernel_neon_begin();
173 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
174 aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
175 (u8 *)ctx->key_dec, rounds, blocks, walk.iv,
176 first);
177 err = blkcipher_walk_done(desc, &walk, 0);
178 }
179 kernel_neon_end();
180 return err;
181}
182
183static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
184 struct scatterlist *src, unsigned int nbytes)
185{
186 struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
187 int err, first, rounds = 6 + ctx->key_length / 4;
188 struct blkcipher_walk walk;
189 int blocks;
190
191 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
192 blkcipher_walk_init(&walk, dst, src, nbytes);
193 err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
194
195 first = 1;
196 kernel_neon_begin();
197 while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
198 aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
199 (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
200 first);
201 first = 0;
202 nbytes -= blocks * AES_BLOCK_SIZE;
203 if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE)
204 break;
205 err = blkcipher_walk_done(desc, &walk,
206 walk.nbytes % AES_BLOCK_SIZE);
207 }
208 if (nbytes) {
209 u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
210 u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
211 u8 __aligned(8) tail[AES_BLOCK_SIZE];
212
213 /*
214 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
215 * to tell aes_ctr_encrypt() to only read half a block.
216 */
217 blocks = (nbytes <= 8) ? -1 : 1;
218
219 aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
220 blocks, walk.iv, first);
221 memcpy(tdst, tail, nbytes);
222 err = blkcipher_walk_done(desc, &walk, 0);
223 }
224 kernel_neon_end();
225
226 return err;
227}
228
229static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
230 struct scatterlist *src, unsigned int nbytes)
231{
232 struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
233 int err, first, rounds = 6 + ctx->key1.key_length / 4;
234 struct blkcipher_walk walk;
235 unsigned int blocks;
236
237 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
238 blkcipher_walk_init(&walk, dst, src, nbytes);
239 err = blkcipher_walk_virt(desc, &walk);
240
241 kernel_neon_begin();
242 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
243 aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
244 (u8 *)ctx->key1.key_enc, rounds, blocks,
245 (u8 *)ctx->key2.key_enc, walk.iv, first);
246 err = blkcipher_walk_done(desc, &walk, 0);
247 }
248 kernel_neon_end();
249
250 return err;
251}
252
253static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
254 struct scatterlist *src, unsigned int nbytes)
255{
256 struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
257 int err, first, rounds = 6 + ctx->key1.key_length / 4;
258 struct blkcipher_walk walk;
259 unsigned int blocks;
260
261 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
262 blkcipher_walk_init(&walk, dst, src, nbytes);
263 err = blkcipher_walk_virt(desc, &walk);
264
265 kernel_neon_begin();
266 for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
267 aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
268 (u8 *)ctx->key1.key_dec, rounds, blocks,
269 (u8 *)ctx->key2.key_enc, walk.iv, first);
270 err = blkcipher_walk_done(desc, &walk, 0);
271 }
272 kernel_neon_end();
273
274 return err;
275}
276
277static struct crypto_alg aes_algs[] = { {
278 .cra_name = "__ecb-aes-" MODE,
279 .cra_driver_name = "__driver-ecb-aes-" MODE,
280 .cra_priority = 0,
281 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
282 .cra_blocksize = AES_BLOCK_SIZE,
283 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
284 .cra_alignmask = 7,
285 .cra_type = &crypto_blkcipher_type,
286 .cra_module = THIS_MODULE,
287 .cra_blkcipher = {
288 .min_keysize = AES_MIN_KEY_SIZE,
289 .max_keysize = AES_MAX_KEY_SIZE,
290 .ivsize = AES_BLOCK_SIZE,
291 .setkey = crypto_aes_set_key,
292 .encrypt = ecb_encrypt,
293 .decrypt = ecb_decrypt,
294 },
295}, {
296 .cra_name = "__cbc-aes-" MODE,
297 .cra_driver_name = "__driver-cbc-aes-" MODE,
298 .cra_priority = 0,
299 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
300 .cra_blocksize = AES_BLOCK_SIZE,
301 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
302 .cra_alignmask = 7,
303 .cra_type = &crypto_blkcipher_type,
304 .cra_module = THIS_MODULE,
305 .cra_blkcipher = {
306 .min_keysize = AES_MIN_KEY_SIZE,
307 .max_keysize = AES_MAX_KEY_SIZE,
308 .ivsize = AES_BLOCK_SIZE,
309 .setkey = crypto_aes_set_key,
310 .encrypt = cbc_encrypt,
311 .decrypt = cbc_decrypt,
312 },
313}, {
314 .cra_name = "__ctr-aes-" MODE,
315 .cra_driver_name = "__driver-ctr-aes-" MODE,
316 .cra_priority = 0,
317 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
318 .cra_blocksize = 1,
319 .cra_ctxsize = sizeof(struct crypto_aes_ctx),
320 .cra_alignmask = 7,
321 .cra_type = &crypto_blkcipher_type,
322 .cra_module = THIS_MODULE,
323 .cra_blkcipher = {
324 .min_keysize = AES_MIN_KEY_SIZE,
325 .max_keysize = AES_MAX_KEY_SIZE,
326 .ivsize = AES_BLOCK_SIZE,
327 .setkey = crypto_aes_set_key,
328 .encrypt = ctr_encrypt,
329 .decrypt = ctr_encrypt,
330 },
331}, {
332 .cra_name = "__xts-aes-" MODE,
333 .cra_driver_name = "__driver-xts-aes-" MODE,
334 .cra_priority = 0,
335 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
336 .cra_blocksize = AES_BLOCK_SIZE,
337 .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
338 .cra_alignmask = 7,
339 .cra_type = &crypto_blkcipher_type,
340 .cra_module = THIS_MODULE,
341 .cra_blkcipher = {
342 .min_keysize = 2 * AES_MIN_KEY_SIZE,
343 .max_keysize = 2 * AES_MAX_KEY_SIZE,
344 .ivsize = AES_BLOCK_SIZE,
345 .setkey = xts_set_key,
346 .encrypt = xts_encrypt,
347 .decrypt = xts_decrypt,
348 },
349}, {
350 .cra_name = "ecb(aes)",
351 .cra_driver_name = "ecb-aes-" MODE,
352 .cra_priority = PRIO,
353 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
354 .cra_blocksize = AES_BLOCK_SIZE,
355 .cra_ctxsize = sizeof(struct async_helper_ctx),
356 .cra_alignmask = 7,
357 .cra_type = &crypto_ablkcipher_type,
358 .cra_module = THIS_MODULE,
359 .cra_init = ablk_init,
360 .cra_exit = ablk_exit,
361 .cra_ablkcipher = {
362 .min_keysize = AES_MIN_KEY_SIZE,
363 .max_keysize = AES_MAX_KEY_SIZE,
364 .ivsize = AES_BLOCK_SIZE,
365 .setkey = ablk_set_key,
366 .encrypt = ablk_encrypt,
367 .decrypt = ablk_decrypt,
368 }
369}, {
370 .cra_name = "cbc(aes)",
371 .cra_driver_name = "cbc-aes-" MODE,
372 .cra_priority = PRIO,
373 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
374 .cra_blocksize = AES_BLOCK_SIZE,
375 .cra_ctxsize = sizeof(struct async_helper_ctx),
376 .cra_alignmask = 7,
377 .cra_type = &crypto_ablkcipher_type,
378 .cra_module = THIS_MODULE,
379 .cra_init = ablk_init,
380 .cra_exit = ablk_exit,
381 .cra_ablkcipher = {
382 .min_keysize = AES_MIN_KEY_SIZE,
383 .max_keysize = AES_MAX_KEY_SIZE,
384 .ivsize = AES_BLOCK_SIZE,
385 .setkey = ablk_set_key,
386 .encrypt = ablk_encrypt,
387 .decrypt = ablk_decrypt,
388 }
389}, {
390 .cra_name = "ctr(aes)",
391 .cra_driver_name = "ctr-aes-" MODE,
392 .cra_priority = PRIO,
393 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
394 .cra_blocksize = 1,
395 .cra_ctxsize = sizeof(struct async_helper_ctx),
396 .cra_alignmask = 7,
397 .cra_type = &crypto_ablkcipher_type,
398 .cra_module = THIS_MODULE,
399 .cra_init = ablk_init,
400 .cra_exit = ablk_exit,
401 .cra_ablkcipher = {
402 .min_keysize = AES_MIN_KEY_SIZE,
403 .max_keysize = AES_MAX_KEY_SIZE,
404 .ivsize = AES_BLOCK_SIZE,
405 .setkey = ablk_set_key,
406 .encrypt = ablk_encrypt,
407 .decrypt = ablk_decrypt,
408 }
409}, {
410 .cra_name = "xts(aes)",
411 .cra_driver_name = "xts-aes-" MODE,
412 .cra_priority = PRIO,
413 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
414 .cra_blocksize = AES_BLOCK_SIZE,
415 .cra_ctxsize = sizeof(struct async_helper_ctx),
416 .cra_alignmask = 7,
417 .cra_type = &crypto_ablkcipher_type,
418 .cra_module = THIS_MODULE,
419 .cra_init = ablk_init,
420 .cra_exit = ablk_exit,
421 .cra_ablkcipher = {
422 .min_keysize = 2 * AES_MIN_KEY_SIZE,
423 .max_keysize = 2 * AES_MAX_KEY_SIZE,
424 .ivsize = AES_BLOCK_SIZE,
425 .setkey = ablk_set_key,
426 .encrypt = ablk_encrypt,
427 .decrypt = ablk_decrypt,
428 }
429} };
430
431static int __init aes_init(void)
432{
433 return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
434}
435
436static void __exit aes_exit(void)
437{
438 crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
439}
440
441#ifdef USE_V8_CRYPTO_EXTENSIONS
442module_cpu_feature_match(AES, aes_init);
443#else
444module_init(aes_init);
445#endif
446module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
new file mode 100644
index 000000000000..f6e372c528eb
--- /dev/null
+++ b/arch/arm64/crypto/aes-modes.S
@@ -0,0 +1,532 @@
1/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13 .text
14 .align 4
15
16/*
17 * There are several ways to instantiate this code:
18 * - no interleave, all inline
19 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23 *
24 * Macros imported by this code:
25 * - enc_prepare - setup NEON registers for encryption
26 * - dec_prepare - setup NEON registers for decryption
27 * - enc_switch_key - change to new key after having prepared for encryption
28 * - encrypt_block - encrypt a single block
29 * - decrypt block - decrypt a single block
30 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34 */
35
36#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
37#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
38#define FRAME_POP ldp x29, x30, [sp],#16
39
40#if INTERLEAVE == 2
41
42aes_encrypt_block2x:
43 encrypt_block2x v0, v1, w3, x2, x6, w7
44 ret
45ENDPROC(aes_encrypt_block2x)
46
47aes_decrypt_block2x:
48 decrypt_block2x v0, v1, w3, x2, x6, w7
49 ret
50ENDPROC(aes_decrypt_block2x)
51
52#elif INTERLEAVE == 4
53
54aes_encrypt_block4x:
55 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
56 ret
57ENDPROC(aes_encrypt_block4x)
58
59aes_decrypt_block4x:
60 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
61 ret
62ENDPROC(aes_decrypt_block4x)
63
64#else
65#error INTERLEAVE should equal 2 or 4
66#endif
67
68 .macro do_encrypt_block2x
69 bl aes_encrypt_block2x
70 .endm
71
72 .macro do_decrypt_block2x
73 bl aes_decrypt_block2x
74 .endm
75
76 .macro do_encrypt_block4x
77 bl aes_encrypt_block4x
78 .endm
79
80 .macro do_decrypt_block4x
81 bl aes_decrypt_block4x
82 .endm
83
84#else
85#define FRAME_PUSH
86#define FRAME_POP
87
88 .macro do_encrypt_block2x
89 encrypt_block2x v0, v1, w3, x2, x6, w7
90 .endm
91
92 .macro do_decrypt_block2x
93 decrypt_block2x v0, v1, w3, x2, x6, w7
94 .endm
95
96 .macro do_encrypt_block4x
97 encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
98 .endm
99
100 .macro do_decrypt_block4x
101 decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
102 .endm
103
104#endif
105
106 /*
107 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108 * int blocks, int first)
109 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110 * int blocks, int first)
111 */
112
113AES_ENTRY(aes_ecb_encrypt)
114 FRAME_PUSH
115 cbz w5, .LecbencloopNx
116
117 enc_prepare w3, x2, x5
118
119.LecbencloopNx:
120#if INTERLEAVE >= 2
121 subs w4, w4, #INTERLEAVE
122 bmi .Lecbenc1x
123#if INTERLEAVE == 2
124 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
125 do_encrypt_block2x
126 st1 {v0.16b-v1.16b}, [x0], #32
127#else
128 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
129 do_encrypt_block4x
130 st1 {v0.16b-v3.16b}, [x0], #64
131#endif
132 b .LecbencloopNx
133.Lecbenc1x:
134 adds w4, w4, #INTERLEAVE
135 beq .Lecbencout
136#endif
137.Lecbencloop:
138 ld1 {v0.16b}, [x1], #16 /* get next pt block */
139 encrypt_block v0, w3, x2, x5, w6
140 st1 {v0.16b}, [x0], #16
141 subs w4, w4, #1
142 bne .Lecbencloop
143.Lecbencout:
144 FRAME_POP
145 ret
146AES_ENDPROC(aes_ecb_encrypt)
147
148
149AES_ENTRY(aes_ecb_decrypt)
150 FRAME_PUSH
151 cbz w5, .LecbdecloopNx
152
153 dec_prepare w3, x2, x5
154
155.LecbdecloopNx:
156#if INTERLEAVE >= 2
157 subs w4, w4, #INTERLEAVE
158 bmi .Lecbdec1x
159#if INTERLEAVE == 2
160 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
161 do_decrypt_block2x
162 st1 {v0.16b-v1.16b}, [x0], #32
163#else
164 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
165 do_decrypt_block4x
166 st1 {v0.16b-v3.16b}, [x0], #64
167#endif
168 b .LecbdecloopNx
169.Lecbdec1x:
170 adds w4, w4, #INTERLEAVE
171 beq .Lecbdecout
172#endif
173.Lecbdecloop:
174 ld1 {v0.16b}, [x1], #16 /* get next ct block */
175 decrypt_block v0, w3, x2, x5, w6
176 st1 {v0.16b}, [x0], #16
177 subs w4, w4, #1
178 bne .Lecbdecloop
179.Lecbdecout:
180 FRAME_POP
181 ret
182AES_ENDPROC(aes_ecb_decrypt)
183
184
185 /*
186 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187 * int blocks, u8 iv[], int first)
188 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189 * int blocks, u8 iv[], int first)
190 */
191
192AES_ENTRY(aes_cbc_encrypt)
193 cbz w6, .Lcbcencloop
194
195 ld1 {v0.16b}, [x5] /* get iv */
196 enc_prepare w3, x2, x5
197
198.Lcbcencloop:
199 ld1 {v1.16b}, [x1], #16 /* get next pt block */
200 eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
201 encrypt_block v0, w3, x2, x5, w6
202 st1 {v0.16b}, [x0], #16
203 subs w4, w4, #1
204 bne .Lcbcencloop
205 ret
206AES_ENDPROC(aes_cbc_encrypt)
207
208
209AES_ENTRY(aes_cbc_decrypt)
210 FRAME_PUSH
211 cbz w6, .LcbcdecloopNx
212
213 ld1 {v7.16b}, [x5] /* get iv */
214 dec_prepare w3, x2, x5
215
216.LcbcdecloopNx:
217#if INTERLEAVE >= 2
218 subs w4, w4, #INTERLEAVE
219 bmi .Lcbcdec1x
220#if INTERLEAVE == 2
221 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
222 mov v2.16b, v0.16b
223 mov v3.16b, v1.16b
224 do_decrypt_block2x
225 eor v0.16b, v0.16b, v7.16b
226 eor v1.16b, v1.16b, v2.16b
227 mov v7.16b, v3.16b
228 st1 {v0.16b-v1.16b}, [x0], #32
229#else
230 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
231 mov v4.16b, v0.16b
232 mov v5.16b, v1.16b
233 mov v6.16b, v2.16b
234 do_decrypt_block4x
235 sub x1, x1, #16
236 eor v0.16b, v0.16b, v7.16b
237 eor v1.16b, v1.16b, v4.16b
238 ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
239 eor v2.16b, v2.16b, v5.16b
240 eor v3.16b, v3.16b, v6.16b
241 st1 {v0.16b-v3.16b}, [x0], #64
242#endif
243 b .LcbcdecloopNx
244.Lcbcdec1x:
245 adds w4, w4, #INTERLEAVE
246 beq .Lcbcdecout
247#endif
248.Lcbcdecloop:
249 ld1 {v1.16b}, [x1], #16 /* get next ct block */
250 mov v0.16b, v1.16b /* ...and copy to v0 */
251 decrypt_block v0, w3, x2, x5, w6
252 eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
253 mov v7.16b, v1.16b /* ct is next iv */
254 st1 {v0.16b}, [x0], #16
255 subs w4, w4, #1
256 bne .Lcbcdecloop
257.Lcbcdecout:
258 FRAME_POP
259 ret
260AES_ENDPROC(aes_cbc_decrypt)
261
262
263 /*
264 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
265 * int blocks, u8 ctr[], int first)
266 */
267
268AES_ENTRY(aes_ctr_encrypt)
269 FRAME_PUSH
270 cbnz w6, .Lctrfirst /* 1st time around? */
271 umov x5, v4.d[1] /* keep swabbed ctr in reg */
272 rev x5, x5
273#if INTERLEAVE >= 2
274 cmn w5, w4 /* 32 bit overflow? */
275 bcs .Lctrinc
276 add x5, x5, #1 /* increment BE ctr */
277 b .LctrincNx
278#else
279 b .Lctrinc
280#endif
281.Lctrfirst:
282 enc_prepare w3, x2, x6
283 ld1 {v4.16b}, [x5]
284 umov x5, v4.d[1] /* keep swabbed ctr in reg */
285 rev x5, x5
286#if INTERLEAVE >= 2
287 cmn w5, w4 /* 32 bit overflow? */
288 bcs .Lctrloop
289.LctrloopNx:
290 subs w4, w4, #INTERLEAVE
291 bmi .Lctr1x
292#if INTERLEAVE == 2
293 mov v0.8b, v4.8b
294 mov v1.8b, v4.8b
295 rev x7, x5
296 add x5, x5, #1
297 ins v0.d[1], x7
298 rev x7, x5
299 add x5, x5, #1
300 ins v1.d[1], x7
301 ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
302 do_encrypt_block2x
303 eor v0.16b, v0.16b, v2.16b
304 eor v1.16b, v1.16b, v3.16b
305 st1 {v0.16b-v1.16b}, [x0], #32
306#else
307 ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
308 dup v7.4s, w5
309 mov v0.16b, v4.16b
310 add v7.4s, v7.4s, v8.4s
311 mov v1.16b, v4.16b
312 rev32 v8.16b, v7.16b
313 mov v2.16b, v4.16b
314 mov v3.16b, v4.16b
315 mov v1.s[3], v8.s[0]
316 mov v2.s[3], v8.s[1]
317 mov v3.s[3], v8.s[2]
318 ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
319 do_encrypt_block4x
320 eor v0.16b, v5.16b, v0.16b
321 ld1 {v5.16b}, [x1], #16 /* get 1 input block */
322 eor v1.16b, v6.16b, v1.16b
323 eor v2.16b, v7.16b, v2.16b
324 eor v3.16b, v5.16b, v3.16b
325 st1 {v0.16b-v3.16b}, [x0], #64
326 add x5, x5, #INTERLEAVE
327#endif
328 cbz w4, .LctroutNx
329.LctrincNx:
330 rev x7, x5
331 ins v4.d[1], x7
332 b .LctrloopNx
333.LctroutNx:
334 sub x5, x5, #1
335 rev x7, x5
336 ins v4.d[1], x7
337 b .Lctrout
338.Lctr1x:
339 adds w4, w4, #INTERLEAVE
340 beq .Lctrout
341#endif
342.Lctrloop:
343 mov v0.16b, v4.16b
344 encrypt_block v0, w3, x2, x6, w7
345 subs w4, w4, #1
346 bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
347 ld1 {v3.16b}, [x1], #16
348 eor v3.16b, v0.16b, v3.16b
349 st1 {v3.16b}, [x0], #16
350 beq .Lctrout
351.Lctrinc:
352 adds x5, x5, #1 /* increment BE ctr */
353 rev x7, x5
354 ins v4.d[1], x7
355 bcc .Lctrloop /* no overflow? */
356 umov x7, v4.d[0] /* load upper word of ctr */
357 rev x7, x7 /* ... to handle the carry */
358 add x7, x7, #1
359 rev x7, x7
360 ins v4.d[0], x7
361 b .Lctrloop
362.Lctrhalfblock:
363 ld1 {v3.8b}, [x1]
364 eor v3.8b, v0.8b, v3.8b
365 st1 {v3.8b}, [x0]
366.Lctrout:
367 FRAME_POP
368 ret
369AES_ENDPROC(aes_ctr_encrypt)
370 .ltorg
371
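The counter logic above keeps the low 64 bits of the big-endian counter byte-swapped in x5 and only reloads the upper half when the increment carries. As a plain-C sketch of the same 128-bit increment, treating the counter as two host-order halves (illustrative only, not taken from this patch):

#include <stdint.h>

/* hi/lo are the logical upper and lower 64 bits of the 128-bit counter */
static void ctr128_increment(uint64_t *hi, uint64_t *lo)
{
	if (++*lo == 0)		/* low half wrapped: carry into the upper half */
		++*hi;
}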
372
373 /*
374 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
375 * int blocks, u8 const rk2[], u8 iv[], int first)
376 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
377 * int blocks, u8 const rk2[], u8 iv[], int first)
378 */
379
380 .macro next_tweak, out, in, const, tmp
381 sshr \tmp\().2d, \in\().2d, #63
382 and \tmp\().16b, \tmp\().16b, \const\().16b
383 add \out\().2d, \in\().2d, \in\().2d
384 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
385 eor \out\().16b, \out\().16b, \tmp\().16b
386 .endm
387
388.Lxts_mul_x:
389 .word 1, 0, 0x87, 0
390
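next_tweak is the standard XTS tweak update: multiply the 128-bit tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, which is what the { 1, 0, 0x87, 0 } constant encodes. A plain-C sketch of the same step, assuming the tweak is kept as two little-endian 64-bit halves (illustrative only):

#include <stdint.h>

/* t[0] = low 64 bits of the tweak, t[1] = high 64 bits */
static void xts_next_tweak(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit left shift by one  */
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);/* reduce by x^7 + x^2 + x + 1 */
}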
391AES_ENTRY(aes_xts_encrypt)
392 FRAME_PUSH
393 cbz w7, .LxtsencloopNx
394
395 ld1 {v4.16b}, [x6]
396 enc_prepare w3, x5, x6
397 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
398 enc_switch_key w3, x2, x6
399 ldr q7, .Lxts_mul_x
400 b .LxtsencNx
401
402.LxtsencloopNx:
403 ldr q7, .Lxts_mul_x
404 next_tweak v4, v4, v7, v8
405.LxtsencNx:
406#if INTERLEAVE >= 2
407 subs w4, w4, #INTERLEAVE
408 bmi .Lxtsenc1x
409#if INTERLEAVE == 2
410 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
411 next_tweak v5, v4, v7, v8
412 eor v0.16b, v0.16b, v4.16b
413 eor v1.16b, v1.16b, v5.16b
414 do_encrypt_block2x
415 eor v0.16b, v0.16b, v4.16b
416 eor v1.16b, v1.16b, v5.16b
417 st1 {v0.16b-v1.16b}, [x0], #32
418 cbz w4, .LxtsencoutNx
419 next_tweak v4, v5, v7, v8
420 b .LxtsencNx
421.LxtsencoutNx:
422 mov v4.16b, v5.16b
423 b .Lxtsencout
424#else
425 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
426 next_tweak v5, v4, v7, v8
427 eor v0.16b, v0.16b, v4.16b
428 next_tweak v6, v5, v7, v8
429 eor v1.16b, v1.16b, v5.16b
430 eor v2.16b, v2.16b, v6.16b
431 next_tweak v7, v6, v7, v8
432 eor v3.16b, v3.16b, v7.16b
433 do_encrypt_block4x
434 eor v3.16b, v3.16b, v7.16b
435 eor v0.16b, v0.16b, v4.16b
436 eor v1.16b, v1.16b, v5.16b
437 eor v2.16b, v2.16b, v6.16b
438 st1 {v0.16b-v3.16b}, [x0], #64
439 mov v4.16b, v7.16b
440 cbz w4, .Lxtsencout
441 b .LxtsencloopNx
442#endif
443.Lxtsenc1x:
444 adds w4, w4, #INTERLEAVE
445 beq .Lxtsencout
446#endif
447.Lxtsencloop:
448 ld1 {v1.16b}, [x1], #16
449 eor v0.16b, v1.16b, v4.16b
450 encrypt_block v0, w3, x2, x6, w7
451 eor v0.16b, v0.16b, v4.16b
452 st1 {v0.16b}, [x0], #16
453 subs w4, w4, #1
454 beq .Lxtsencout
455 next_tweak v4, v4, v7, v8
456 b .Lxtsencloop
457.Lxtsencout:
458 FRAME_POP
459 ret
460AES_ENDPROC(aes_xts_encrypt)
461
462
463AES_ENTRY(aes_xts_decrypt)
464 FRAME_PUSH
465 cbz w7, .LxtsdecloopNx
466
467 ld1 {v4.16b}, [x6]
468 enc_prepare w3, x5, x6
469 encrypt_block v4, w3, x5, x6, w7 /* first tweak */
470 dec_prepare w3, x2, x6
471 ldr q7, .Lxts_mul_x
472 b .LxtsdecNx
473
474.LxtsdecloopNx:
475 ldr q7, .Lxts_mul_x
476 next_tweak v4, v4, v7, v8
477.LxtsdecNx:
478#if INTERLEAVE >= 2
479 subs w4, w4, #INTERLEAVE
480 bmi .Lxtsdec1x
481#if INTERLEAVE == 2
482 ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
483 next_tweak v5, v4, v7, v8
484 eor v0.16b, v0.16b, v4.16b
485 eor v1.16b, v1.16b, v5.16b
486 do_decrypt_block2x
487 eor v0.16b, v0.16b, v4.16b
488 eor v1.16b, v1.16b, v5.16b
489 st1 {v0.16b-v1.16b}, [x0], #32
490 cbz w4, .LxtsdecoutNx
491 next_tweak v4, v5, v7, v8
492 b .LxtsdecNx
493.LxtsdecoutNx:
494 mov v4.16b, v5.16b
495 b .Lxtsdecout
496#else
497 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
498 next_tweak v5, v4, v7, v8
499 eor v0.16b, v0.16b, v4.16b
500 next_tweak v6, v5, v7, v8
501 eor v1.16b, v1.16b, v5.16b
502 eor v2.16b, v2.16b, v6.16b
503 next_tweak v7, v6, v7, v8
504 eor v3.16b, v3.16b, v7.16b
505 do_decrypt_block4x
506 eor v3.16b, v3.16b, v7.16b
507 eor v0.16b, v0.16b, v4.16b
508 eor v1.16b, v1.16b, v5.16b
509 eor v2.16b, v2.16b, v6.16b
510 st1 {v0.16b-v3.16b}, [x0], #64
511 mov v4.16b, v7.16b
512 cbz w4, .Lxtsdecout
513 b .LxtsdecloopNx
514#endif
515.Lxtsdec1x:
516 adds w4, w4, #INTERLEAVE
517 beq .Lxtsdecout
518#endif
519.Lxtsdecloop:
520 ld1 {v1.16b}, [x1], #16
521 eor v0.16b, v1.16b, v4.16b
522 decrypt_block v0, w3, x2, x6, w7
523 eor v0.16b, v0.16b, v4.16b
524 st1 {v0.16b}, [x0], #16
525 subs w4, w4, #1
526 beq .Lxtsdecout
527 next_tweak v4, v4, v7, v8
528 b .Lxtsdecloop
529.Lxtsdecout:
530 FRAME_POP
531 ret
532AES_ENDPROC(aes_xts_decrypt)
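These mode routines are called from the C glue layer (aes-glue.c in this series, not shown in this hunk), with the actual symbols carrying a ce_ or neon_ prefix added by AES_ENTRY. A rough sketch of how a CBC encrypt path could drive neon_aes_cbc_encrypt through the blkcipher walk API under kernel_neon_begin()/kernel_neon_end(); helper names and details here are illustrative, not the patch's exact implementation:

#include <linux/linkage.h>
#include <crypto/aes.h>
#include <crypto/algapi.h>
#include <asm/neon.h>

asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
				     int rounds, int blocks, u8 iv[], int first);

static int cbc_encrypt_walk(struct blkcipher_desc *desc, struct scatterlist *dst,
			    struct scatterlist *src, unsigned int nbytes)
{
	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	int err, first = 1, rounds = 6 + ctx->key_length / 4;
	struct blkcipher_walk walk;
	unsigned int blocks;

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);

	kernel_neon_begin();
	while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
		/* hand whole blocks to the NEON routine, IV chained via walk.iv */
		neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
				     (u8 *)ctx->key_enc, rounds, blocks,
				     walk.iv, first);
		first = 0;
		err = blkcipher_walk_done(desc, &walk,
					  walk.nbytes % AES_BLOCK_SIZE);
	}
	kernel_neon_end();
	return err;
}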
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
new file mode 100644
index 000000000000..b93170e1cc93
--- /dev/null
+++ b/arch/arm64/crypto/aes-neon.S
@@ -0,0 +1,382 @@
1/*
2 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12
13#define AES_ENTRY(func) ENTRY(neon_ ## func)
14#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
15
16 /* multiply by polynomial 'x' in GF(2^8) */
17 .macro mul_by_x, out, in, temp, const
18 sshr \temp, \in, #7
19 add \out, \in, \in
20 and \temp, \temp, \const
21 eor \out, \out, \temp
22 .endm
23
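mul_by_x is byte-wise doubling in GF(2^8) as used by MixColumns: shift left by one and xor in the reduction constant 0x1b whenever the top bit falls out (the sshr/and pair builds that conditional mask). The scalar equivalent for a single byte, for reference:

#include <stdint.h>

static uint8_t gf256_mul_by_x(uint8_t in)
{
	/* top bit set selects the 0x1b reduction, mirroring sshr #7 + and */
	uint8_t mask = (in & 0x80) ? 0x1b : 0x00;

	return (uint8_t)(in << 1) ^ mask;
}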
24 /* preload the entire Sbox */
25 .macro prepare, sbox, shiftrows, temp
26 adr \temp, \sbox
27 movi v12.16b, #0x40
28 ldr q13, \shiftrows
29 movi v14.16b, #0x1b
30 ld1 {v16.16b-v19.16b}, [\temp], #64
31 ld1 {v20.16b-v23.16b}, [\temp], #64
32 ld1 {v24.16b-v27.16b}, [\temp], #64
33 ld1 {v28.16b-v31.16b}, [\temp]
34 .endm
35
36 /* do preload for encryption */
37 .macro enc_prepare, ignore0, ignore1, temp
38 prepare .LForward_Sbox, .LForward_ShiftRows, \temp
39 .endm
40
41 .macro enc_switch_key, ignore0, ignore1, temp
42 /* do nothing */
43 .endm
44
45 /* do preload for decryption */
46 .macro dec_prepare, ignore0, ignore1, temp
47 prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
48 .endm
49
50 /* apply SubBytes transformation using the preloaded Sbox */
51 .macro sub_bytes, in
52 sub v9.16b, \in\().16b, v12.16b
53 tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
54 sub v10.16b, v9.16b, v12.16b
55 tbx \in\().16b, {v20.16b-v23.16b}, v9.16b
56 sub v11.16b, v10.16b, v12.16b
57 tbx \in\().16b, {v24.16b-v27.16b}, v10.16b
58 tbx \in\().16b, {v28.16b-v31.16b}, v11.16b
59 .endm
60
61 /* apply MixColumns transformation */
62 .macro mix_columns, in
63 mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b
64 rev32 v8.8h, \in\().8h
65 eor \in\().16b, v10.16b, \in\().16b
66 shl v9.4s, v8.4s, #24
67 shl v11.4s, \in\().4s, #24
68 sri v9.4s, v8.4s, #8
69 sri v11.4s, \in\().4s, #8
70 eor v9.16b, v9.16b, v8.16b
71 eor v10.16b, v10.16b, v9.16b
72 eor \in\().16b, v10.16b, v11.16b
73 .endm
74
75 /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
76 .macro inv_mix_columns, in
77 mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b
78 mul_by_x v11.16b, v11.16b, v10.16b, v14.16b
79 eor \in\().16b, \in\().16b, v11.16b
80 rev32 v11.8h, v11.8h
81 eor \in\().16b, \in\().16b, v11.16b
82 mix_columns \in
83 .endm
84
85 .macro do_block, enc, in, rounds, rk, rkp, i
86 ld1 {v15.16b}, [\rk]
87 add \rkp, \rk, #16
88 mov \i, \rounds
891111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
90 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
91 sub_bytes \in
92 ld1 {v15.16b}, [\rkp], #16
93 subs \i, \i, #1
94 beq 2222f
95 .if \enc == 1
96 mix_columns \in
97 .else
98 inv_mix_columns \in
99 .endif
100 b 1111b
1012222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
102 .endm
103
104 .macro encrypt_block, in, rounds, rk, rkp, i
105 do_block 1, \in, \rounds, \rk, \rkp, \i
106 .endm
107
108 .macro decrypt_block, in, rounds, rk, rkp, i
109 do_block 0, \in, \rounds, \rk, \rkp, \i
110 .endm
111
112 /*
113 * Interleaved versions: functionally equivalent to the
114 * ones above, but applied to 2 or 4 AES states in parallel.
115 */
116
117 .macro sub_bytes_2x, in0, in1
118 sub v8.16b, \in0\().16b, v12.16b
119 sub v9.16b, \in1\().16b, v12.16b
120 tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
121 tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
122 sub v10.16b, v8.16b, v12.16b
123 sub v11.16b, v9.16b, v12.16b
124 tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
125 tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
126 sub v8.16b, v10.16b, v12.16b
127 sub v9.16b, v11.16b, v12.16b
128 tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
129 tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
130 tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
131 tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
132 .endm
133
134 .macro sub_bytes_4x, in0, in1, in2, in3
135 sub v8.16b, \in0\().16b, v12.16b
136 tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
137 sub v9.16b, \in1\().16b, v12.16b
138 tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
139 sub v10.16b, \in2\().16b, v12.16b
140 tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
141 sub v11.16b, \in3\().16b, v12.16b
142 tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
143 tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
144 tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
145 sub v8.16b, v8.16b, v12.16b
146 tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b
147 sub v9.16b, v9.16b, v12.16b
148 tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b
149 sub v10.16b, v10.16b, v12.16b
150 tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b
151 sub v11.16b, v11.16b, v12.16b
152 tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b
153 sub v8.16b, v8.16b, v12.16b
154 tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b
155 sub v9.16b, v9.16b, v12.16b
156 tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b
157 sub v10.16b, v10.16b, v12.16b
158 tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
159 sub v11.16b, v11.16b, v12.16b
160 tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
161 tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b
162 tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b
163 .endm
164
165 .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
166 sshr \tmp0\().16b, \in0\().16b, #7
167 add \out0\().16b, \in0\().16b, \in0\().16b
168 sshr \tmp1\().16b, \in1\().16b, #7
169 and \tmp0\().16b, \tmp0\().16b, \const\().16b
170 add \out1\().16b, \in1\().16b, \in1\().16b
171 and \tmp1\().16b, \tmp1\().16b, \const\().16b
172 eor \out0\().16b, \out0\().16b, \tmp0\().16b
173 eor \out1\().16b, \out1\().16b, \tmp1\().16b
174 .endm
175
176 .macro mix_columns_2x, in0, in1
177 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
178 rev32 v10.8h, \in0\().8h
179 rev32 v11.8h, \in1\().8h
180 eor \in0\().16b, v8.16b, \in0\().16b
181 eor \in1\().16b, v9.16b, \in1\().16b
182 shl v12.4s, v10.4s, #24
183 shl v13.4s, v11.4s, #24
184 eor v8.16b, v8.16b, v10.16b
185 sri v12.4s, v10.4s, #8
186 shl v10.4s, \in0\().4s, #24
187 eor v9.16b, v9.16b, v11.16b
188 sri v13.4s, v11.4s, #8
189 shl v11.4s, \in1\().4s, #24
190 sri v10.4s, \in0\().4s, #8
191 eor \in0\().16b, v8.16b, v12.16b
192 sri v11.4s, \in1\().4s, #8
193 eor \in1\().16b, v9.16b, v13.16b
194 eor \in0\().16b, v10.16b, \in0\().16b
195 eor \in1\().16b, v11.16b, \in1\().16b
196 .endm
197
198 .macro inv_mix_cols_2x, in0, in1
199 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
200 mul_by_x_2x v8, v9, v8, v9, v10, v11, v14
201 eor \in0\().16b, \in0\().16b, v8.16b
202 eor \in1\().16b, \in1\().16b, v9.16b
203 rev32 v8.8h, v8.8h
204 rev32 v9.8h, v9.8h
205 eor \in0\().16b, \in0\().16b, v8.16b
206 eor \in1\().16b, \in1\().16b, v9.16b
207 mix_columns_2x \in0, \in1
208 .endm
209
210 .macro inv_mix_cols_4x, in0, in1, in2, in3
211 mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
212 mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14
213 mul_by_x_2x v8, v9, v8, v9, v12, v13, v14
214 mul_by_x_2x v10, v11, v10, v11, v12, v13, v14
215 eor \in0\().16b, \in0\().16b, v8.16b
216 eor \in1\().16b, \in1\().16b, v9.16b
217 eor \in2\().16b, \in2\().16b, v10.16b
218 eor \in3\().16b, \in3\().16b, v11.16b
219 rev32 v8.8h, v8.8h
220 rev32 v9.8h, v9.8h
221 rev32 v10.8h, v10.8h
222 rev32 v11.8h, v11.8h
223 eor \in0\().16b, \in0\().16b, v8.16b
224 eor \in1\().16b, \in1\().16b, v9.16b
225 eor \in2\().16b, \in2\().16b, v10.16b
226 eor \in3\().16b, \in3\().16b, v11.16b
227 mix_columns_2x \in0, \in1
228 mix_columns_2x \in2, \in3
229 .endm
230
231 .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
232 ld1 {v15.16b}, [\rk]
233 add \rkp, \rk, #16
234 mov \i, \rounds
2351111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
236 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
237 sub_bytes_2x \in0, \in1
238 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
239 tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
240 ld1 {v15.16b}, [\rkp], #16
241 subs \i, \i, #1
242 beq 2222f
243 .if \enc == 1
244 mix_columns_2x \in0, \in1
245 ldr q13, .LForward_ShiftRows
246 .else
247 inv_mix_cols_2x \in0, \in1
248 ldr q13, .LReverse_ShiftRows
249 .endif
250 movi v12.16b, #0x40
251 b 1111b
2522222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
253 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
254 .endm
255
256 .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
257 ld1 {v15.16b}, [\rk]
258 add \rkp, \rk, #16
259 mov \i, \rounds
2601111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
261 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
262 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
263 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
264 sub_bytes_4x \in0, \in1, \in2, \in3
265 tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
266 tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
267 tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
268 tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
269 ld1 {v15.16b}, [\rkp], #16
270 subs \i, \i, #1
271 beq 2222f
272 .if \enc == 1
273 mix_columns_2x \in0, \in1
274 mix_columns_2x \in2, \in3
275 ldr q13, .LForward_ShiftRows
276 .else
277 inv_mix_cols_4x \in0, \in1, \in2, \in3
278 ldr q13, .LReverse_ShiftRows
279 .endif
280 movi v12.16b, #0x40
281 b 1111b
2822222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
283 eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
284 eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
285 eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
286 .endm
287
288 .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
289 do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
290 .endm
291
292 .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
293 do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
294 .endm
295
296 .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
297 do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
298 .endm
299
300 .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
301 do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
302 .endm
303
304#include "aes-modes.S"
305
306 .text
307 .align 4
308.LForward_ShiftRows:
309 .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
310 .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
311
312.LReverse_ShiftRows:
313 .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
314 .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
315
316.LForward_Sbox:
317 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
318 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
319 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
320 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
321 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
322 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
323 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
324 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
325 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
326 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
327 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
328 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
329 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
330 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
331 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
332 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
333 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
334 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
335 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
336 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
337 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
338 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
339 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
340 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
341 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
342 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
343 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
344 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
345 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
346 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
347 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
348 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
349
350.LReverse_Sbox:
351 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
352 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
353 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
354 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
355 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
356 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
357 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
358 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
359 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
360 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
361 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
362 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
363 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
364 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
365 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
366 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
367 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
368 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
369 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
370 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
371 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
372 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
373 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
374 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
375 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
376 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
377 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
378 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
379 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
380 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
381 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
382 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
new file mode 100644
index 000000000000..b9e6eaf41c9b
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -0,0 +1,95 @@
1/*
2 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
3 *
4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 *
6 * Based on arch/x86/crypto/ghash-clmulni-intel_asm.S
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/assembler.h>
21
22 DATA .req v0
23 SHASH .req v1
24 IN1 .req v2
25 T1 .req v2
26 T2 .req v3
27 T3 .req v4
28 VZR .req v5
29
30 .text
31 .arch armv8-a+crypto
32
33 /*
34 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
35 * struct ghash_key const *k, const char *head)
36 */
37ENTRY(pmull_ghash_update)
38 ld1 {DATA.16b}, [x1]
39 ld1 {SHASH.16b}, [x3]
40 eor VZR.16b, VZR.16b, VZR.16b
41
42 /* do the head block first, if supplied */
43 cbz x4, 0f
44 ld1 {IN1.2d}, [x4]
45 b 1f
46
470: ld1 {IN1.2d}, [x2], #16
48 sub w0, w0, #1
491: ext IN1.16b, IN1.16b, IN1.16b, #8
50CPU_LE( rev64 IN1.16b, IN1.16b )
51 eor DATA.16b, DATA.16b, IN1.16b
52
53 /* multiply DATA by SHASH in GF(2^128) */
54 ext T2.16b, DATA.16b, DATA.16b, #8
55 ext T3.16b, SHASH.16b, SHASH.16b, #8
56 eor T2.16b, T2.16b, DATA.16b
57 eor T3.16b, T3.16b, SHASH.16b
58
59 pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
60 pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
61 pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
62 eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
63 eor T2.16b, T2.16b, DATA.16b
64
65 ext T3.16b, VZR.16b, T2.16b, #8
66 ext T2.16b, T2.16b, VZR.16b, #8
67 eor DATA.16b, DATA.16b, T3.16b
68 eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
69 // carry-less multiplication
70
71 /* first phase of the reduction */
72 shl T3.2d, DATA.2d, #1
73 eor T3.16b, T3.16b, DATA.16b
74 shl T3.2d, T3.2d, #5
75 eor T3.16b, T3.16b, DATA.16b
76 shl T3.2d, T3.2d, #57
77 ext T2.16b, VZR.16b, T3.16b, #8
78 ext T3.16b, T3.16b, VZR.16b, #8
79 eor DATA.16b, DATA.16b, T2.16b
80 eor T1.16b, T1.16b, T3.16b
81
82 /* second phase of the reduction */
83 ushr T2.2d, DATA.2d, #5
84 eor T2.16b, T2.16b, DATA.16b
85 ushr T2.2d, T2.2d, #1
86 eor T2.16b, T2.16b, DATA.16b
87 ushr T2.2d, T2.2d, #1
88 eor T1.16b, T1.16b, T2.16b
89 eor DATA.16b, DATA.16b, T1.16b
90
91 cbnz w0, 0b
92
93 st1 {DATA.16b}, [x1]
94 ret
95ENDPROC(pmull_ghash_update)
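The multiply above uses the Karatsuba identity to form a 128x128 carry-less product from three 64x64 PMULLs: the cross terms a0*b1 + a1*b0 equal (a0 + a1)*(b0 + b1) + a0*b0 + a1*b1, with '+' meaning xor. A bit-serial C model of the 64x64 carry-less multiply that can be used to check the identity on test values (illustrative only, not part of the patch):

#include <stdint.h>

/* res[0] = low 64 bits, res[1] = high 64 bits of the 128-bit product */
static void clmul64(uint64_t a, uint64_t b, uint64_t res[2])
{
	int i;

	res[0] = res[1] = 0;
	for (i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			res[0] ^= a << i;
			if (i)
				res[1] ^= a >> (64 - i);
		}
	}
}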
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
new file mode 100644
index 000000000000..b92baf3f68c7
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -0,0 +1,155 @@
1/*
2 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
3 *
4 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <linux/cpufeature.h>
15#include <linux/crypto.h>
16#include <linux/module.h>
17
18MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
19MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
20MODULE_LICENSE("GPL v2");
21
22#define GHASH_BLOCK_SIZE 16
23#define GHASH_DIGEST_SIZE 16
24
25struct ghash_key {
26 u64 a;
27 u64 b;
28};
29
30struct ghash_desc_ctx {
31 u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)];
32 u8 buf[GHASH_BLOCK_SIZE];
33 u32 count;
34};
35
36asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
37 struct ghash_key const *k, const char *head);
38
39static int ghash_init(struct shash_desc *desc)
40{
41 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
42
43 *ctx = (struct ghash_desc_ctx){};
44 return 0;
45}
46
47static int ghash_update(struct shash_desc *desc, const u8 *src,
48 unsigned int len)
49{
50 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
51 unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
52
53 ctx->count += len;
54
55 if ((partial + len) >= GHASH_BLOCK_SIZE) {
56 struct ghash_key *key = crypto_shash_ctx(desc->tfm);
57 int blocks;
58
59 if (partial) {
60 int p = GHASH_BLOCK_SIZE - partial;
61
62 memcpy(ctx->buf + partial, src, p);
63 src += p;
64 len -= p;
65 }
66
67 blocks = len / GHASH_BLOCK_SIZE;
68 len %= GHASH_BLOCK_SIZE;
69
70 kernel_neon_begin_partial(6);
71 pmull_ghash_update(blocks, ctx->digest, src, key,
72 partial ? ctx->buf : NULL);
73 kernel_neon_end();
74 src += blocks * GHASH_BLOCK_SIZE;
75 }
76 if (len)
77 memcpy(ctx->buf + partial, src, len);
78 return 0;
79}
80
81static int ghash_final(struct shash_desc *desc, u8 *dst)
82{
83 struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
84 unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
85
86 if (partial) {
87 struct ghash_key *key = crypto_shash_ctx(desc->tfm);
88
89 memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
90
91 kernel_neon_begin_partial(6);
92 pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
93 kernel_neon_end();
94 }
95 put_unaligned_be64(ctx->digest[1], dst);
96 put_unaligned_be64(ctx->digest[0], dst + 8);
97
98 *ctx = (struct ghash_desc_ctx){};
99 return 0;
100}
101
102static int ghash_setkey(struct crypto_shash *tfm,
103 const u8 *inkey, unsigned int keylen)
104{
105 struct ghash_key *key = crypto_shash_ctx(tfm);
106 u64 a, b;
107
108 if (keylen != GHASH_BLOCK_SIZE) {
109 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
110 return -EINVAL;
111 }
112
113 /* perform multiplication by 'x' in GF(2^128) */
114 b = get_unaligned_be64(inkey);
115 a = get_unaligned_be64(inkey + 8);
116
117 key->a = (a << 1) | (b >> 63);
118 key->b = (b << 1) | (a >> 63);
119
120 if (b >> 63)
121 key->b ^= 0xc200000000000000UL;
122
123 return 0;
124}
125
126static struct shash_alg ghash_alg = {
127 .digestsize = GHASH_DIGEST_SIZE,
128 .init = ghash_init,
129 .update = ghash_update,
130 .final = ghash_final,
131 .setkey = ghash_setkey,
132 .descsize = sizeof(struct ghash_desc_ctx),
133 .base = {
134 .cra_name = "ghash",
135 .cra_driver_name = "ghash-ce",
136 .cra_priority = 200,
137 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
138 .cra_blocksize = GHASH_BLOCK_SIZE,
139 .cra_ctxsize = sizeof(struct ghash_key),
140 .cra_module = THIS_MODULE,
141 },
142};
143
144static int __init ghash_ce_mod_init(void)
145{
146 return crypto_register_shash(&ghash_alg);
147}
148
149static void __exit ghash_ce_mod_exit(void)
150{
151 crypto_unregister_shash(&ghash_alg);
152}
153
154module_cpu_feature_match(PMULL, ghash_ce_mod_init);
155module_exit(ghash_ce_mod_exit);
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
new file mode 100644
index 000000000000..09d57d98609c
--- /dev/null
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -0,0 +1,153 @@
1/*
2 * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14 .text
15 .arch armv8-a+crypto
16
17 k0 .req v0
18 k1 .req v1
19 k2 .req v2
20 k3 .req v3
21
22 t0 .req v4
23 t1 .req v5
24
25 dga .req q6
26 dgav .req v6
27 dgb .req s7
28 dgbv .req v7
29
30 dg0q .req q12
31 dg0s .req s12
32 dg0v .req v12
33 dg1s .req s13
34 dg1v .req v13
35 dg2s .req s14
36
37 .macro add_only, op, ev, rc, s0, dg1
38 .ifc \ev, ev
39 add t1.4s, v\s0\().4s, \rc\().4s
40 sha1h dg2s, dg0s
41 .ifnb \dg1
42 sha1\op dg0q, \dg1, t0.4s
43 .else
44 sha1\op dg0q, dg1s, t0.4s
45 .endif
46 .else
47 .ifnb \s0
48 add t0.4s, v\s0\().4s, \rc\().4s
49 .endif
50 sha1h dg1s, dg0s
51 sha1\op dg0q, dg2s, t1.4s
52 .endif
53 .endm
54
55 .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
56 sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
57 add_only \op, \ev, \rc, \s1, \dg1
58 sha1su1 v\s0\().4s, v\s3\().4s
59 .endm
60
61 /*
62 * The SHA1 round constants
63 */
64 .align 4
65.Lsha1_rcon:
66 .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
67
68 /*
69 * void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
70 * u8 *head, long bytes)
71 */
72ENTRY(sha1_ce_transform)
73 /* load round constants */
74 adr x6, .Lsha1_rcon
75 ld1r {k0.4s}, [x6], #4
76 ld1r {k1.4s}, [x6], #4
77 ld1r {k2.4s}, [x6], #4
78 ld1r {k3.4s}, [x6]
79
80 /* load state */
81 ldr dga, [x2]
82 ldr dgb, [x2, #16]
83
84 /* load partial state (if supplied) */
85 cbz x3, 0f
86 ld1 {v8.4s-v11.4s}, [x3]
87 b 1f
88
89 /* load input */
900: ld1 {v8.4s-v11.4s}, [x1], #64
91 sub w0, w0, #1
92
931:
94CPU_LE( rev32 v8.16b, v8.16b )
95CPU_LE( rev32 v9.16b, v9.16b )
96CPU_LE( rev32 v10.16b, v10.16b )
97CPU_LE( rev32 v11.16b, v11.16b )
98
992: add t0.4s, v8.4s, k0.4s
100 mov dg0v.16b, dgav.16b
101
102 add_update c, ev, k0, 8, 9, 10, 11, dgb
103 add_update c, od, k0, 9, 10, 11, 8
104 add_update c, ev, k0, 10, 11, 8, 9
105 add_update c, od, k0, 11, 8, 9, 10
106 add_update c, ev, k1, 8, 9, 10, 11
107
108 add_update p, od, k1, 9, 10, 11, 8
109 add_update p, ev, k1, 10, 11, 8, 9
110 add_update p, od, k1, 11, 8, 9, 10
111 add_update p, ev, k1, 8, 9, 10, 11
112 add_update p, od, k2, 9, 10, 11, 8
113
114 add_update m, ev, k2, 10, 11, 8, 9
115 add_update m, od, k2, 11, 8, 9, 10
116 add_update m, ev, k2, 8, 9, 10, 11
117 add_update m, od, k2, 9, 10, 11, 8
118 add_update m, ev, k3, 10, 11, 8, 9
119
120 add_update p, od, k3, 11, 8, 9, 10
121 add_only p, ev, k3, 9
122 add_only p, od, k3, 10
123 add_only p, ev, k3, 11
124 add_only p, od
125
126 /* update state */
127 add dgbv.2s, dgbv.2s, dg1v.2s
128 add dgav.4s, dgav.4s, dg0v.4s
129
130 cbnz w0, 0b
131
132 /*
133 * Final block: add padding and total bit count.
134 * Skip if we have no total byte count in x4. In that case, the input
135 * size was not a round multiple of the block size, and the padding is
136 * handled by the C code.
137 */
138 cbz x4, 3f
139 movi v9.2d, #0
140 mov x8, #0x80000000
141 movi v10.2d, #0
142 ror x7, x4, #29 // ror(lsl(x4, 3), 32)
143 fmov d8, x8
144 mov x4, #0
145 mov v11.d[0], xzr
146 mov v11.d[1], x7
147 b 2b
148
149 /* store new state */
1503: str dga, [x2]
151 str dgb, [x2, #16]
152 ret
153ENDPROC(sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
new file mode 100644
index 000000000000..6fe83f37a750
--- /dev/null
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -0,0 +1,174 @@
1/*
2 * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <crypto/sha.h>
15#include <linux/cpufeature.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
20MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
21MODULE_LICENSE("GPL v2");
22
23asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
24 u8 *head, long bytes);
25
26static int sha1_init(struct shash_desc *desc)
27{
28 struct sha1_state *sctx = shash_desc_ctx(desc);
29
30 *sctx = (struct sha1_state){
31 .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
32 };
33 return 0;
34}
35
36static int sha1_update(struct shash_desc *desc, const u8 *data,
37 unsigned int len)
38{
39 struct sha1_state *sctx = shash_desc_ctx(desc);
40 unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
41
42 sctx->count += len;
43
44 if ((partial + len) >= SHA1_BLOCK_SIZE) {
45 int blocks;
46
47 if (partial) {
48 int p = SHA1_BLOCK_SIZE - partial;
49
50 memcpy(sctx->buffer + partial, data, p);
51 data += p;
52 len -= p;
53 }
54
55 blocks = len / SHA1_BLOCK_SIZE;
56 len %= SHA1_BLOCK_SIZE;
57
58 kernel_neon_begin_partial(16);
59 sha1_ce_transform(blocks, data, sctx->state,
60 partial ? sctx->buffer : NULL, 0);
61 kernel_neon_end();
62
63 data += blocks * SHA1_BLOCK_SIZE;
64 partial = 0;
65 }
66 if (len)
67 memcpy(sctx->buffer + partial, data, len);
68 return 0;
69}
70
71static int sha1_final(struct shash_desc *desc, u8 *out)
72{
73 static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
74
75 struct sha1_state *sctx = shash_desc_ctx(desc);
76 __be64 bits = cpu_to_be64(sctx->count << 3);
77 __be32 *dst = (__be32 *)out;
78 int i;
79
80 u32 padlen = SHA1_BLOCK_SIZE
81 - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE);
82
83 sha1_update(desc, padding, padlen);
84 sha1_update(desc, (const u8 *)&bits, sizeof(bits));
85
86 for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
87 put_unaligned_be32(sctx->state[i], dst++);
88
89 *sctx = (struct sha1_state){};
90 return 0;
91}
92
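The padlen expression in sha1_final always leaves room for the mandatory 0x80 byte: when (count + 8) is already a multiple of the block size, a full extra 64-byte block of padding is generated. A quick standalone check of that arithmetic (illustrative only):

#include <stdio.h>
#include <stdint.h>

#define SHA1_BLOCK_SIZE	64

int main(void)
{
	uint64_t counts[] = { 3, 55, 56, 64 };
	unsigned int i;

	for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		unsigned int padlen = SHA1_BLOCK_SIZE -
			((counts[i] + sizeof(uint64_t)) % SHA1_BLOCK_SIZE);
		/* prints 53, 1, 64 and 56 respectively */
		printf("count=%llu padlen=%u\n",
		       (unsigned long long)counts[i], padlen);
	}
	return 0;
}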
93static int sha1_finup(struct shash_desc *desc, const u8 *data,
94 unsigned int len, u8 *out)
95{
96 struct sha1_state *sctx = shash_desc_ctx(desc);
97 __be32 *dst = (__be32 *)out;
98 int blocks;
99 int i;
100
101 if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) {
102 sha1_update(desc, data, len);
103 return sha1_final(desc, out);
104 }
105
106 /*
107 * Use a fast path if the input is a multiple of 64 bytes. In
108 * this case, there is no need to copy data around, and we can
109 * perform the entire digest calculation in a single invocation
110 * of sha1_ce_transform()
111 */
112 blocks = len / SHA1_BLOCK_SIZE;
113
114 kernel_neon_begin_partial(16);
115 sha1_ce_transform(blocks, data, sctx->state, NULL, len);
116 kernel_neon_end();
117
118 for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
119 put_unaligned_be32(sctx->state[i], dst++);
120
121 *sctx = (struct sha1_state){};
122 return 0;
123}
124
125static int sha1_export(struct shash_desc *desc, void *out)
126{
127 struct sha1_state *sctx = shash_desc_ctx(desc);
128 struct sha1_state *dst = out;
129
130 *dst = *sctx;
131 return 0;
132}
133
134static int sha1_import(struct shash_desc *desc, const void *in)
135{
136 struct sha1_state *sctx = shash_desc_ctx(desc);
137 struct sha1_state const *src = in;
138
139 *sctx = *src;
140 return 0;
141}
142
143static struct shash_alg alg = {
144 .init = sha1_init,
145 .update = sha1_update,
146 .final = sha1_final,
147 .finup = sha1_finup,
148 .export = sha1_export,
149 .import = sha1_import,
150 .descsize = sizeof(struct sha1_state),
151 .digestsize = SHA1_DIGEST_SIZE,
152 .statesize = sizeof(struct sha1_state),
153 .base = {
154 .cra_name = "sha1",
155 .cra_driver_name = "sha1-ce",
156 .cra_priority = 200,
157 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
158 .cra_blocksize = SHA1_BLOCK_SIZE,
159 .cra_module = THIS_MODULE,
160 }
161};
162
163static int __init sha1_ce_mod_init(void)
164{
165 return crypto_register_shash(&alg);
166}
167
168static void __exit sha1_ce_mod_fini(void)
169{
170 crypto_unregister_shash(&alg);
171}
172
173module_cpu_feature_match(SHA1, sha1_ce_mod_init);
174module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
new file mode 100644
index 000000000000..7f29fc031ea8
--- /dev/null
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -0,0 +1,156 @@
1/*
2 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14 .text
15 .arch armv8-a+crypto
16
17 dga .req q20
18 dgav .req v20
19 dgb .req q21
20 dgbv .req v21
21
22 t0 .req v22
23 t1 .req v23
24
25 dg0q .req q24
26 dg0v .req v24
27 dg1q .req q25
28 dg1v .req v25
29 dg2q .req q26
30 dg2v .req v26
31
32 .macro add_only, ev, rc, s0
33 mov dg2v.16b, dg0v.16b
34 .ifeq \ev
35 add t1.4s, v\s0\().4s, \rc\().4s
36 sha256h dg0q, dg1q, t0.4s
37 sha256h2 dg1q, dg2q, t0.4s
38 .else
39 .ifnb \s0
40 add t0.4s, v\s0\().4s, \rc\().4s
41 .endif
42 sha256h dg0q, dg1q, t1.4s
43 sha256h2 dg1q, dg2q, t1.4s
44 .endif
45 .endm
46
47 .macro add_update, ev, rc, s0, s1, s2, s3
48 sha256su0 v\s0\().4s, v\s1\().4s
49 add_only \ev, \rc, \s1
50 sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
51 .endm
52
53 /*
54 * The SHA-256 round constants
55 */
56 .align 4
57.Lsha2_rcon:
58 .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
59 .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
60 .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
61 .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
62 .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
63 .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
64 .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
65 .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
66 .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
67 .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
68 .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
69 .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
70 .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
71 .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
72 .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
73 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
74
75 /*
76 * void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
77 * u8 *head, long bytes)
78 */
79ENTRY(sha2_ce_transform)
80 /* load round constants */
81 adr x8, .Lsha2_rcon
82 ld1 { v0.4s- v3.4s}, [x8], #64
83 ld1 { v4.4s- v7.4s}, [x8], #64
84 ld1 { v8.4s-v11.4s}, [x8], #64
85 ld1 {v12.4s-v15.4s}, [x8]
86
87 /* load state */
88 ldp dga, dgb, [x2]
89
90 /* load partial input (if supplied) */
91 cbz x3, 0f
92 ld1 {v16.4s-v19.4s}, [x3]
93 b 1f
94
95 /* load input */
960: ld1 {v16.4s-v19.4s}, [x1], #64
97 sub w0, w0, #1
98
991:
100CPU_LE( rev32 v16.16b, v16.16b )
101CPU_LE( rev32 v17.16b, v17.16b )
102CPU_LE( rev32 v18.16b, v18.16b )
103CPU_LE( rev32 v19.16b, v19.16b )
104
1052: add t0.4s, v16.4s, v0.4s
106 mov dg0v.16b, dgav.16b
107 mov dg1v.16b, dgbv.16b
108
109 add_update 0, v1, 16, 17, 18, 19
110 add_update 1, v2, 17, 18, 19, 16
111 add_update 0, v3, 18, 19, 16, 17
112 add_update 1, v4, 19, 16, 17, 18
113
114 add_update 0, v5, 16, 17, 18, 19
115 add_update 1, v6, 17, 18, 19, 16
116 add_update 0, v7, 18, 19, 16, 17
117 add_update 1, v8, 19, 16, 17, 18
118
119 add_update 0, v9, 16, 17, 18, 19
120 add_update 1, v10, 17, 18, 19, 16
121 add_update 0, v11, 18, 19, 16, 17
122 add_update 1, v12, 19, 16, 17, 18
123
124 add_only 0, v13, 17
125 add_only 1, v14, 18
126 add_only 0, v15, 19
127 add_only 1
128
129 /* update state */
130 add dgav.4s, dgav.4s, dg0v.4s
131 add dgbv.4s, dgbv.4s, dg1v.4s
132
133 /* handled all input blocks? */
134 cbnz w0, 0b
135
136 /*
137 * Final block: add padding and total bit count.
138 * Skip if we have no total byte count in x4. In that case, the input
139 * size was not a round multiple of the block size, and the padding is
140 * handled by the C code.
141 */
142 cbz x4, 3f
143 movi v17.2d, #0
144 mov x8, #0x80000000
145 movi v18.2d, #0
146 ror x7, x4, #29 // ror(lsl(x4, 3), 32)
147 fmov d16, x8
148 mov x4, #0
149 mov v19.d[0], xzr
150 mov v19.d[1], x7
151 b 2b
152
153 /* store new state */
1543: stp dga, dgb, [x2]
155 ret
156ENDPROC(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
new file mode 100644
index 000000000000..c294e67d3925
--- /dev/null
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -0,0 +1,255 @@
1/*
2 * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
3 *
4 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <asm/unaligned.h>
13#include <crypto/internal/hash.h>
14#include <crypto/sha.h>
15#include <linux/cpufeature.h>
16#include <linux/crypto.h>
17#include <linux/module.h>
18
19MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
20MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
21MODULE_LICENSE("GPL v2");
22
23asmlinkage void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
24 u8 *head, long bytes);
25
26static int sha224_init(struct shash_desc *desc)
27{
28 struct sha256_state *sctx = shash_desc_ctx(desc);
29
30 *sctx = (struct sha256_state){
31 .state = {
32 SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
33 SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
34 }
35 };
36 return 0;
37}
38
39static int sha256_init(struct shash_desc *desc)
40{
41 struct sha256_state *sctx = shash_desc_ctx(desc);
42
43 *sctx = (struct sha256_state){
44 .state = {
45 SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
46 SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
47 }
48 };
49 return 0;
50}
51
52static int sha2_update(struct shash_desc *desc, const u8 *data,
53 unsigned int len)
54{
55 struct sha256_state *sctx = shash_desc_ctx(desc);
56 unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
57
58 sctx->count += len;
59
60 if ((partial + len) >= SHA256_BLOCK_SIZE) {
61 int blocks;
62
63 if (partial) {
64 int p = SHA256_BLOCK_SIZE - partial;
65
66 memcpy(sctx->buf + partial, data, p);
67 data += p;
68 len -= p;
69 }
70
71 blocks = len / SHA256_BLOCK_SIZE;
72 len %= SHA256_BLOCK_SIZE;
73
74 kernel_neon_begin_partial(28);
75 sha2_ce_transform(blocks, data, sctx->state,
76 partial ? sctx->buf : NULL, 0);
77 kernel_neon_end();
78
79 data += blocks * SHA256_BLOCK_SIZE;
80 partial = 0;
81 }
82 if (len)
83 memcpy(sctx->buf + partial, data, len);
84 return 0;
85}
86
87static void sha2_final(struct shash_desc *desc)
88{
89 static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
90
91 struct sha256_state *sctx = shash_desc_ctx(desc);
92 __be64 bits = cpu_to_be64(sctx->count << 3);
93 u32 padlen = SHA256_BLOCK_SIZE
94 - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE);
95
96 sha2_update(desc, padding, padlen);
97 sha2_update(desc, (const u8 *)&bits, sizeof(bits));
98}
99
100static int sha224_final(struct shash_desc *desc, u8 *out)
101{
102 struct sha256_state *sctx = shash_desc_ctx(desc);
103 __be32 *dst = (__be32 *)out;
104 int i;
105
106 sha2_final(desc);
107
108 for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
109 put_unaligned_be32(sctx->state[i], dst++);
110
111 *sctx = (struct sha256_state){};
112 return 0;
113}
114
115static int sha256_final(struct shash_desc *desc, u8 *out)
116{
117 struct sha256_state *sctx = shash_desc_ctx(desc);
118 __be32 *dst = (__be32 *)out;
119 int i;
120
121 sha2_final(desc);
122
123 for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
124 put_unaligned_be32(sctx->state[i], dst++);
125
126 *sctx = (struct sha256_state){};
127 return 0;
128}
129
130static void sha2_finup(struct shash_desc *desc, const u8 *data,
131 unsigned int len)
132{
133 struct sha256_state *sctx = shash_desc_ctx(desc);
134 int blocks;
135
136 if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) {
137 sha2_update(desc, data, len);
138 sha2_final(desc);
139 return;
140 }
141
142 /*
143 * Use a fast path if the input is a multiple of 64 bytes. In
144 * this case, there is no need to copy data around, and we can
145 * perform the entire digest calculation in a single invocation
146 * of sha2_ce_transform()
147 */
148 blocks = len / SHA256_BLOCK_SIZE;
149
150 kernel_neon_begin_partial(28);
151 sha2_ce_transform(blocks, data, sctx->state, NULL, len);
152 kernel_neon_end();
153 data += blocks * SHA256_BLOCK_SIZE;
154}
155
156static int sha224_finup(struct shash_desc *desc, const u8 *data,
157 unsigned int len, u8 *out)
158{
159 struct sha256_state *sctx = shash_desc_ctx(desc);
160 __be32 *dst = (__be32 *)out;
161 int i;
162
163 sha2_finup(desc, data, len);
164
165 for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
166 put_unaligned_be32(sctx->state[i], dst++);
167
168 *sctx = (struct sha256_state){};
169 return 0;
170}
171
172static int sha256_finup(struct shash_desc *desc, const u8 *data,
173 unsigned int len, u8 *out)
174{
175 struct sha256_state *sctx = shash_desc_ctx(desc);
176 __be32 *dst = (__be32 *)out;
177 int i;
178
179 sha2_finup(desc, data, len);
180
181 for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
182 put_unaligned_be32(sctx->state[i], dst++);
183
184 *sctx = (struct sha256_state){};
185 return 0;
186}
187
188static int sha2_export(struct shash_desc *desc, void *out)
189{
190 struct sha256_state *sctx = shash_desc_ctx(desc);
191 struct sha256_state *dst = out;
192
193 *dst = *sctx;
194 return 0;
195}
196
197static int sha2_import(struct shash_desc *desc, const void *in)
198{
199 struct sha256_state *sctx = shash_desc_ctx(desc);
200 struct sha256_state const *src = in;
201
202 *sctx = *src;
203 return 0;
204}
205
206static struct shash_alg algs[] = { {
207 .init = sha224_init,
208 .update = sha2_update,
209 .final = sha224_final,
210 .finup = sha224_finup,
211 .export = sha2_export,
212 .import = sha2_import,
213 .descsize = sizeof(struct sha256_state),
214 .digestsize = SHA224_DIGEST_SIZE,
215 .statesize = sizeof(struct sha256_state),
216 .base = {
217 .cra_name = "sha224",
218 .cra_driver_name = "sha224-ce",
219 .cra_priority = 200,
220 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
221 .cra_blocksize = SHA256_BLOCK_SIZE,
222 .cra_module = THIS_MODULE,
223 }
224}, {
225 .init = sha256_init,
226 .update = sha2_update,
227 .final = sha256_final,
228 .finup = sha256_finup,
229 .export = sha2_export,
230 .import = sha2_import,
231 .descsize = sizeof(struct sha256_state),
232 .digestsize = SHA256_DIGEST_SIZE,
233 .statesize = sizeof(struct sha256_state),
234 .base = {
235 .cra_name = "sha256",
236 .cra_driver_name = "sha256-ce",
237 .cra_priority = 200,
238 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
239 .cra_blocksize = SHA256_BLOCK_SIZE,
240 .cra_module = THIS_MODULE,
241 }
242} };
243
244static int __init sha2_ce_mod_init(void)
245{
246 return crypto_register_shashes(algs, ARRAY_SIZE(algs));
247}
248
249static void __exit sha2_ce_mod_fini(void)
250{
251 crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
252}
253
254module_cpu_feature_match(SHA2, sha2_ce_mod_init);
255module_exit(sha2_ce_mod_fini);
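Once registered, these transforms are picked up through the regular crypto API by name ("sha224"/"sha256"), with cra_priority 200 winning over the generic C implementations. A minimal sketch of an in-kernel caller using the shash interface of this era; the helper below is hypothetical, not part of the patch:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int sha256_digest_example(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("sha256", 0, 0);	/* resolves to sha256-ce if usable */
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}

	desc->tfm = tfm;
	desc->flags = 0;

	err = crypto_shash_digest(desc, data, len, out);

	kfree(desc);
	crypto_free_shash(tfm);
	return err;
}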
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 83f71b3004a8..42c7eecd2bb6 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -40,6 +40,7 @@ generic-y += segment.h
40generic-y += sembuf.h 40generic-y += sembuf.h
41generic-y += serial.h 41generic-y += serial.h
42generic-y += shmbuf.h 42generic-y += shmbuf.h
43generic-y += simd.h
43generic-y += sizes.h 44generic-y += sizes.h
44generic-y += socket.h 45generic-y += socket.h
45generic-y += sockios.h 46generic-y += sockios.h
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index fd3e3924041b..5901480bfdca 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -21,6 +21,7 @@
21#endif 21#endif
22 22
23#include <asm/ptrace.h> 23#include <asm/ptrace.h>
24#include <asm/thread_info.h>
24 25
25/* 26/*
26 * Stack pushing/popping (register pairs only). Equivalent to store decrement 27 * Stack pushing/popping (register pairs only). Equivalent to store decrement
@@ -68,23 +69,31 @@
68 msr daifclr, #8 69 msr daifclr, #8
69 .endm 70 .endm
70 71
71 .macro disable_step, tmp 72 .macro disable_step_tsk, flgs, tmp
73 tbz \flgs, #TIF_SINGLESTEP, 9990f
72 mrs \tmp, mdscr_el1 74 mrs \tmp, mdscr_el1
73 bic \tmp, \tmp, #1 75 bic \tmp, \tmp, #1
74 msr mdscr_el1, \tmp 76 msr mdscr_el1, \tmp
77 isb // Synchronise with enable_dbg
789990:
75 .endm 79 .endm
76 80
77 .macro enable_step, tmp 81 .macro enable_step_tsk, flgs, tmp
82 tbz \flgs, #TIF_SINGLESTEP, 9990f
83 disable_dbg
78 mrs \tmp, mdscr_el1 84 mrs \tmp, mdscr_el1
79 orr \tmp, \tmp, #1 85 orr \tmp, \tmp, #1
80 msr mdscr_el1, \tmp 86 msr mdscr_el1, \tmp
879990:
81 .endm 88 .endm
82 89
83 .macro enable_dbg_if_not_stepping, tmp 90/*
84 mrs \tmp, mdscr_el1 91 * Enable both debug exceptions and interrupts. This is likely to be
85 tbnz \tmp, #0, 9990f 92 * faster than two daifclr operations, since writes to this register
86 enable_dbg 93 * are self-synchronising.
879990: 94 */
95 .macro enable_dbg_and_irq
96 msr daifclr, #(8 | 2)
88 .endm 97 .endm
89 98
90/* 99/*
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 57e8cb49824c..65f1569ac96e 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -157,7 +157,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
157 */ 157 */
158#define ATOMIC64_INIT(i) { (i) } 158#define ATOMIC64_INIT(i) { (i) }
159 159
160#define atomic64_read(v) (*(volatile long long *)&(v)->counter) 160#define atomic64_read(v) (*(volatile long *)&(v)->counter)
161#define atomic64_set(v,i) (((v)->counter) = (i)) 161#define atomic64_set(v,i) (((v)->counter) = (i))
162 162
163static inline void atomic64_add(u64 i, atomic64_t *v) 163static inline void atomic64_add(u64 i, atomic64_t *v)
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 48b9e704af7c..6389d60574d9 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -25,12 +25,12 @@
25#define wfi() asm volatile("wfi" : : : "memory") 25#define wfi() asm volatile("wfi" : : : "memory")
26 26
27#define isb() asm volatile("isb" : : : "memory") 27#define isb() asm volatile("isb" : : : "memory")
28#define dmb(opt) asm volatile("dmb sy" : : : "memory") 28#define dmb(opt) asm volatile("dmb " #opt : : : "memory")
29#define dsb(opt) asm volatile("dsb sy" : : : "memory") 29#define dsb(opt) asm volatile("dsb " #opt : : : "memory")
30 30
31#define mb() dsb() 31#define mb() dsb(sy)
32#define rmb() asm volatile("dsb ld" : : : "memory") 32#define rmb() dsb(ld)
33#define wmb() asm volatile("dsb st" : : : "memory") 33#define wmb() dsb(st)
34 34
35#ifndef CONFIG_SMP 35#ifndef CONFIG_SMP
36#define smp_mb() barrier() 36#define smp_mb() barrier()
@@ -40,7 +40,7 @@
40#define smp_store_release(p, v) \ 40#define smp_store_release(p, v) \
41do { \ 41do { \
42 compiletime_assert_atomic_type(*p); \ 42 compiletime_assert_atomic_type(*p); \
43 smp_mb(); \ 43 barrier(); \
44 ACCESS_ONCE(*p) = (v); \ 44 ACCESS_ONCE(*p) = (v); \
45} while (0) 45} while (0)
46 46
@@ -48,15 +48,15 @@ do { \
48({ \ 48({ \
49 typeof(*p) ___p1 = ACCESS_ONCE(*p); \ 49 typeof(*p) ___p1 = ACCESS_ONCE(*p); \
50 compiletime_assert_atomic_type(*p); \ 50 compiletime_assert_atomic_type(*p); \
51 smp_mb(); \ 51 barrier(); \
52 ___p1; \ 52 ___p1; \
53}) 53})
54 54
55#else 55#else
56 56
57#define smp_mb() asm volatile("dmb ish" : : : "memory") 57#define smp_mb() dmb(ish)
58#define smp_rmb() asm volatile("dmb ishld" : : : "memory") 58#define smp_rmb() dmb(ishld)
59#define smp_wmb() asm volatile("dmb ishst" : : : "memory") 59#define smp_wmb() dmb(ishst)
60 60
61#define smp_store_release(p, v) \ 61#define smp_store_release(p, v) \
62do { \ 62do { \
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 390308a67f0d..88cc05b5f3ac 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -16,6 +16,8 @@
16#ifndef __ASM_CACHE_H 16#ifndef __ASM_CACHE_H
17#define __ASM_CACHE_H 17#define __ASM_CACHE_H
18 18
19#include <asm/cachetype.h>
20
19#define L1_CACHE_SHIFT 6 21#define L1_CACHE_SHIFT 6
20#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) 22#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
21 23
@@ -27,6 +29,15 @@
27 * the CPU. 29 * the CPU.
28 */ 30 */
29#define ARCH_DMA_MINALIGN L1_CACHE_BYTES 31#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
30#define ARCH_SLAB_MINALIGN 8 32
33#ifndef __ASSEMBLY__
34
35static inline int cache_line_size(void)
36{
37 u32 cwg = cache_type_cwg();
38 return cwg ? 4 << cwg : L1_CACHE_BYTES;
39}
40
41#endif /* __ASSEMBLY__ */
31 42
32#endif 43#endif
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 4c60e64a801c..a5176cf32dad 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -123,7 +123,7 @@ extern void flush_dcache_page(struct page *);
123static inline void __flush_icache_all(void) 123static inline void __flush_icache_all(void)
124{ 124{
125 asm("ic ialluis"); 125 asm("ic ialluis");
126 dsb(); 126 dsb(ish);
127} 127}
128 128
129#define flush_dcache_mmap_lock(mapping) \ 129#define flush_dcache_mmap_lock(mapping) \
@@ -150,7 +150,7 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end)
150 * set_pte_at() called from vmap_pte_range() does not 150 * set_pte_at() called from vmap_pte_range() does not
151 * have a DSB after cleaning the cache line. 151 * have a DSB after cleaning the cache line.
152 */ 152 */
153 dsb(); 153 dsb(ish);
154} 154}
155 155
156static inline void flush_cache_vunmap(unsigned long start, unsigned long end) 156static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
diff --git a/arch/arm64/include/asm/cachetype.h b/arch/arm64/include/asm/cachetype.h
index 85f5f511352a..4b23e758d5e0 100644
--- a/arch/arm64/include/asm/cachetype.h
+++ b/arch/arm64/include/asm/cachetype.h
@@ -20,12 +20,16 @@
20 20
21#define CTR_L1IP_SHIFT 14 21#define CTR_L1IP_SHIFT 14
22#define CTR_L1IP_MASK 3 22#define CTR_L1IP_MASK 3
23#define CTR_CWG_SHIFT 24
24#define CTR_CWG_MASK 15
23 25
24#define ICACHE_POLICY_RESERVED 0 26#define ICACHE_POLICY_RESERVED 0
25#define ICACHE_POLICY_AIVIVT 1 27#define ICACHE_POLICY_AIVIVT 1
26#define ICACHE_POLICY_VIPT 2 28#define ICACHE_POLICY_VIPT 2
27#define ICACHE_POLICY_PIPT 3 29#define ICACHE_POLICY_PIPT 3
28 30
31#ifndef __ASSEMBLY__
32
29static inline u32 icache_policy(void) 33static inline u32 icache_policy(void)
30{ 34{
31 return (read_cpuid_cachetype() >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK; 35 return (read_cpuid_cachetype() >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK;
@@ -45,4 +49,11 @@ static inline int icache_is_aivivt(void)
45 return icache_policy() == ICACHE_POLICY_AIVIVT; 49 return icache_policy() == ICACHE_POLICY_AIVIVT;
46} 50}
47 51
52static inline u32 cache_type_cwg(void)
53{
54 return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK;
55}
56
57#endif /* __ASSEMBLY__ */
58
48#endif /* __ASM_CACHETYPE_H */ 59#endif /* __ASM_CACHETYPE_H */
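For reference, CTR_EL0.CWG (bits 27:24) encodes the cache write-back granule as log2 of a number of 4-byte words, which is why cache_line_size() above computes 4 << cwg and falls back to L1_CACHE_BYTES when the field reads as zero. A hedged sketch of that decoding, with the register value passed in rather than read from CTR_EL0:

	/* Sketch of the CWG decoding used by cache_line_size(); 'ctr' stands in
	 * for a CTR_EL0 value obtained elsewhere (illustrative helper only).
	 */
	#define MY_CTR_CWG_SHIFT	24
	#define MY_CTR_CWG_MASK		15
	#define MY_L1_CACHE_BYTES	64

	static unsigned int cwg_to_bytes(unsigned int ctr)
	{
		unsigned int cwg = (ctr >> MY_CTR_CWG_SHIFT) & MY_CTR_CWG_MASK;

		return cwg ? 4U << cwg : MY_L1_CACHE_BYTES;	/* CWG=4 -> 64 bytes */
	}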
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 57c0fa7bf711..ddb9d7830558 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -72,7 +72,12 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
72} 72}
73 73
74#define xchg(ptr,x) \ 74#define xchg(ptr,x) \
75 ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) 75({ \
76 __typeof__(*(ptr)) __ret; \
77 __ret = (__typeof__(*(ptr))) \
78 __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))); \
79 __ret; \
80})
76 81
77static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, 82static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
78 unsigned long new, int size) 83 unsigned long new, int size)
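The xchg() rewrite above turns the macro into a GCC statement expression; one practical effect (presumably the motivation) is that callers may ignore the result without a "value computed is not used" warning, while the expression keeps the type of *ptr. A standalone sketch of the same shape, with __my_xchg() standing in for the architecture-specific helper:

	/* Statement-expression sketch; __my_xchg() is an illustrative stand-in,
	 * not the real kernel __xchg().
	 */
	static inline unsigned long __my_xchg(unsigned long x, volatile void *ptr,
					      int size)
	{
		(void)ptr;
		(void)size;
		return x;	/* a real implementation would swap atomically */
	}

	#define my_xchg(ptr, x)							\
	({									\
		__typeof__(*(ptr)) __ret;					\
		__ret = (__typeof__(*(ptr)))					\
			__my_xchg((unsigned long)(x), (ptr), sizeof(*(ptr)));	\
		__ret;								\
	})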
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index e71f81fe127a..253e33bc94fb 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -305,11 +305,6 @@ static inline int is_compat_thread(struct thread_info *thread)
305 305
306#else /* !CONFIG_COMPAT */ 306#else /* !CONFIG_COMPAT */
307 307
308static inline int is_compat_task(void)
309{
310 return 0;
311}
312
313static inline int is_compat_thread(struct thread_info *thread) 308static inline int is_compat_thread(struct thread_info *thread)
314{ 309{
315 return 0; 310 return 0;
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index c4a7f940b387..72674f4c3871 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -18,9 +18,11 @@
18#ifndef __ASM_ESR_H 18#ifndef __ASM_ESR_H
19#define __ASM_ESR_H 19#define __ASM_ESR_H
20 20
21#define ESR_EL1_EC_SHIFT (26) 21#define ESR_EL1_WRITE (1 << 6)
22#define ESR_EL1_IL (1U << 25) 22#define ESR_EL1_CM (1 << 8)
23#define ESR_EL1_IL (1 << 25)
23 24
25#define ESR_EL1_EC_SHIFT (26)
24#define ESR_EL1_EC_UNKNOWN (0x00) 26#define ESR_EL1_EC_UNKNOWN (0x00)
25#define ESR_EL1_EC_WFI (0x01) 27#define ESR_EL1_EC_WFI (0x01)
26#define ESR_EL1_EC_CP15_32 (0x03) 28#define ESR_EL1_EC_CP15_32 (0x03)
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index c43b4ac13008..50f559f574fe 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -37,8 +37,21 @@ struct fpsimd_state {
37 u32 fpcr; 37 u32 fpcr;
38 }; 38 };
39 }; 39 };
40 /* the id of the last cpu to have restored this state */
41 unsigned int cpu;
40}; 42};
41 43
44/*
45 * Struct for stacking the bottom 'n' FP/SIMD registers.
46 */
47struct fpsimd_partial_state {
48 u32 fpsr;
49 u32 fpcr;
50 u32 num_regs;
51 __uint128_t vregs[32];
52};
53
54
42#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 55#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
43/* Masks for extracting the FPSR and FPCR from the FPSCR */ 56/* Masks for extracting the FPSR and FPCR from the FPSCR */
44#define VFP_FPSCR_STAT_MASK 0xf800009f 57#define VFP_FPSCR_STAT_MASK 0xf800009f
@@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state);
58extern void fpsimd_thread_switch(struct task_struct *next); 71extern void fpsimd_thread_switch(struct task_struct *next);
59extern void fpsimd_flush_thread(void); 72extern void fpsimd_flush_thread(void);
60 73
74extern void fpsimd_preserve_current_state(void);
75extern void fpsimd_restore_current_state(void);
76extern void fpsimd_update_current_state(struct fpsimd_state *state);
77
78extern void fpsimd_flush_task_state(struct task_struct *target);
79
80extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state,
81 u32 num_regs);
82extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);
83
61#endif 84#endif
62 85
63#endif 86#endif
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index bbec599c96bd..768414d55e64 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -62,3 +62,38 @@
62 ldr w\tmpnr, [\state, #16 * 2 + 4] 62 ldr w\tmpnr, [\state, #16 * 2 + 4]
63 msr fpcr, x\tmpnr 63 msr fpcr, x\tmpnr
64.endm 64.endm
65
66.altmacro
67.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
68 mrs x\tmpnr1, fpsr
69 str w\numnr, [\state, #8]
70 mrs x\tmpnr2, fpcr
71 stp w\tmpnr1, w\tmpnr2, [\state]
72 adr x\tmpnr1, 0f
73 add \state, \state, x\numnr, lsl #4
74 sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
75 br x\tmpnr1
76 .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
77 .irp qb, %(qa + 1)
78 stp q\qa, q\qb, [\state, # -16 * \qa - 16]
79 .endr
80 .endr
810:
82.endm
83
84.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
85 ldp w\tmpnr1, w\tmpnr2, [\state]
86 msr fpsr, x\tmpnr1
87 msr fpcr, x\tmpnr2
88 adr x\tmpnr1, 0f
89 ldr w\tmpnr2, [\state, #8]
90 add \state, \state, x\tmpnr2, lsl #4
91 sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
92 br x\tmpnr1
93 .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
94 .irp qb, %(qa + 1)
95 ldp q\qa, q\qb, [\state, # -16 * \qa - 16]
96 .endr
97 .endr
980:
99.endm
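The computed branch in these macros works because each stp/ldp of a Q-register pair is one 4-byte instruction covering two registers: subtracting num_regs * 2 bytes from the address of label 0 enters the unrolled sequence exactly num_regs / 2 instructions early, so only the requested registers are saved or restored (num_regs is assumed even). The same arithmetic in C, purely for illustration:

	/* Illustrative only: where the 'br' in fpsimd_save_partial lands, given
	 * the address of label 0 and the number of registers to preserve.
	 */
	static unsigned long partial_branch_target(unsigned long label_0_addr,
						   unsigned int num_regs)
	{
		/* (num_regs / 2) instructions of 4 bytes each == num_regs * 2 bytes */
		return label_0_addr - (unsigned long)num_regs * 2;
	}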
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
new file mode 100644
index 000000000000..c5534facf941
--- /dev/null
+++ b/arch/arm64/include/asm/ftrace.h
@@ -0,0 +1,59 @@
1/*
2 * arch/arm64/include/asm/ftrace.h
3 *
4 * Copyright (C) 2013 Linaro Limited
5 * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef __ASM_FTRACE_H
12#define __ASM_FTRACE_H
13
14#include <asm/insn.h>
15
16#define MCOUNT_ADDR ((unsigned long)_mcount)
17#define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE
18
19#ifndef __ASSEMBLY__
20#include <linux/compat.h>
21
22extern void _mcount(unsigned long);
23extern void *return_address(unsigned int);
24
25struct dyn_arch_ftrace {
26 /* No extra data needed for arm64 */
27};
28
29extern unsigned long ftrace_graph_call;
30
31static inline unsigned long ftrace_call_adjust(unsigned long addr)
32{
33 /*
34 * addr is the address of the mcount call instruction.
35 * recordmcount does the necessary offset calculation.
36 */
37 return addr;
38}
39
40#define ftrace_return_address(n) return_address(n)
41
42/*
43 * Because AArch32 mode does not share the same syscall table with AArch64,
44 * tracing compat syscalls may result in reporting bogus syscalls or even
 45 * a hang, so just do not trace them.
46 * See kernel/trace/trace_syscalls.c
47 *
48 * x86 code says:
 49 * If the user really wants these, then they should use the
50 * raw syscall tracepoints with filtering.
51 */
52#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
53static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
54{
55 return is_compat_task();
56}
57#endif /* ifndef __ASSEMBLY__ */
58
59#endif /* __ASM_FTRACE_H */
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index ae4801d77514..0be67821f9ce 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -20,7 +20,7 @@
20#include <linux/threads.h> 20#include <linux/threads.h>
21#include <asm/irq.h> 21#include <asm/irq.h>
22 22
23#define NR_IPI 5 23#define NR_IPI 6
24 24
25typedef struct { 25typedef struct {
26 unsigned int __softirq_pending; 26 unsigned int __softirq_pending;
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index c44ad39ed310..dc1f73b13e74 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -21,6 +21,7 @@
21/* A64 instructions are always 32 bits. */ 21/* A64 instructions are always 32 bits. */
22#define AARCH64_INSN_SIZE 4 22#define AARCH64_INSN_SIZE 4
23 23
24#ifndef __ASSEMBLY__
24/* 25/*
25 * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a 26 * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a
26 * Section C3.1 "A64 instruction index by encoding": 27 * Section C3.1 "A64 instruction index by encoding":
@@ -104,5 +105,6 @@ bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn);
104int aarch64_insn_patch_text_nosync(void *addr, u32 insn); 105int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
105int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); 106int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt);
106int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); 107int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);
108#endif /* __ASSEMBLY__ */
107 109
108#endif /* __ASM_INSN_H */ 110#endif /* __ASM_INSN_H */
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index a1bef78f0303..e0ecdcf6632d 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -230,19 +230,11 @@ extern void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot
230extern void __iounmap(volatile void __iomem *addr); 230extern void __iounmap(volatile void __iomem *addr);
231extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); 231extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size);
232 232
233#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_DIRTY)
234#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE))
235#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL_NC))
236#define PROT_NORMAL (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
237
238#define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) 233#define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
239#define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) 234#define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
240#define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) 235#define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC))
241#define iounmap __iounmap 236#define iounmap __iounmap
242 237
243#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF)
244#define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PTE_PXN | PTE_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
245
246#define ARCH_HAS_IOREMAP_WC 238#define ARCH_HAS_IOREMAP_WC
247#include <asm-generic/iomap.h> 239#include <asm-generic/iomap.h>
248 240
diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h
index b0cc58a97780..13ce4cc18e26 100644
--- a/arch/arm64/include/asm/neon.h
+++ b/arch/arm64/include/asm/neon.h
@@ -8,7 +8,11 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10 10
11#include <linux/types.h>
12
11#define cpu_has_neon() (1) 13#define cpu_has_neon() (1)
12 14
13void kernel_neon_begin(void); 15#define kernel_neon_begin() kernel_neon_begin_partial(32)
16
17void kernel_neon_begin_partial(u32 num_regs);
14void kernel_neon_end(void); 18void kernel_neon_end(void);
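With this change, kernel_neon_begin() still preserves all 32 Q-registers, while callers that only clobber a few can pass a smaller count and pay for just those saves and restores. A hedged usage sketch; do_neon_xor() is a hypothetical NEON routine, not part of this series:

	#include <asm/neon.h>

	extern void do_neon_xor(void *dst, const void *src, int blocks);	/* hypothetical */

	/* Hypothetical caller that only clobbers q0-q3, so only four registers
	 * of the interrupted context need to be preserved.
	 */
	static void xor_blocks_neon(void *dst, const void *src, int blocks)
	{
		kernel_neon_begin_partial(4);
		do_neon_xor(dst, src, blocks);
		kernel_neon_end();
	}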
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 5fc8a66c3924..955e8c5f0afb 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -29,6 +29,8 @@
29 */ 29 */
30 30
31#define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) 31#define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1)
32#define PUD_TYPE_MASK (_AT(pgdval_t, 3) << 0)
33#define PUD_TYPE_SECT (_AT(pgdval_t, 1) << 0)
32 34
33/* 35/*
34 * Level 2 descriptor (PMD). 36 * Level 2 descriptor (PMD).
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e2f96748859b..598cc384fc1c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -52,66 +52,59 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
52#endif 52#endif
53#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) 53#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
54 54
55/* 55#ifdef CONFIG_SMP
56 * The pgprot_* and protection_map entries will be fixed up at runtime to 56#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
57 * include the cachable and bufferable bits based on memory policy, as well as 57#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
58 * any architecture dependent bits like global/ASID and SMP shared mapping 58#else
59 * bits. 59#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF)
60 */ 60#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF)
61#define _PAGE_DEFAULT PTE_TYPE_PAGE | PTE_AF 61#endif
62 62
63extern pgprot_t pgprot_default; 63#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE))
64#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC))
65#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL))
64 66
65#define __pgprot_modify(prot,mask,bits) \ 67#define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
66 __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) 68#define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
69#define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
67 70
68#define _MOD_PROT(p, b) __pgprot_modify(p, 0, b) 71#define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
69 72
70#define PAGE_NONE __pgprot_modify(pgprot_default, PTE_TYPE_MASK, PTE_PROT_NONE | PTE_PXN | PTE_UXN) 73#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
71#define PAGE_SHARED _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) 74#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
72#define PAGE_SHARED_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE)
73#define PAGE_COPY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN)
74#define PAGE_COPY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN)
75#define PAGE_READONLY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN)
76#define PAGE_READONLY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN)
77#define PAGE_KERNEL _MOD_PROT(pgprot_default, PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
78#define PAGE_KERNEL_EXEC _MOD_PROT(pgprot_default, PTE_UXN | PTE_DIRTY | PTE_WRITE)
79 75
80#define PAGE_HYP _MOD_PROT(pgprot_default, PTE_HYP) 76#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP)
81#define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) 77#define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
82 78
83#define PAGE_S2 __pgprot_modify(pgprot_default, PTE_S2_MEMATTR_MASK, PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) 79#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)
84#define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN) 80#define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN)
85 81
86#define __PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) 82#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN)
87#define __PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) 83#define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
88#define __PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) 84#define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE)
89#define __PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 85#define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN)
90#define __PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) 86#define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN)
91#define __PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 87#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN)
92#define __PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) 88#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN)
93 89
94#endif /* __ASSEMBLY__ */ 90#define __P000 PAGE_NONE
95 91#define __P001 PAGE_READONLY
96#define __P000 __PAGE_NONE 92#define __P010 PAGE_COPY
97#define __P001 __PAGE_READONLY 93#define __P011 PAGE_COPY
98#define __P010 __PAGE_COPY 94#define __P100 PAGE_READONLY_EXEC
99#define __P011 __PAGE_COPY 95#define __P101 PAGE_READONLY_EXEC
100#define __P100 __PAGE_READONLY_EXEC 96#define __P110 PAGE_COPY_EXEC
101#define __P101 __PAGE_READONLY_EXEC 97#define __P111 PAGE_COPY_EXEC
102#define __P110 __PAGE_COPY_EXEC 98
103#define __P111 __PAGE_COPY_EXEC 99#define __S000 PAGE_NONE
104 100#define __S001 PAGE_READONLY
105#define __S000 __PAGE_NONE 101#define __S010 PAGE_SHARED
106#define __S001 __PAGE_READONLY 102#define __S011 PAGE_SHARED
107#define __S010 __PAGE_SHARED 103#define __S100 PAGE_READONLY_EXEC
108#define __S011 __PAGE_SHARED 104#define __S101 PAGE_READONLY_EXEC
109#define __S100 __PAGE_READONLY_EXEC 105#define __S110 PAGE_SHARED_EXEC
110#define __S101 __PAGE_READONLY_EXEC 106#define __S111 PAGE_SHARED_EXEC
111#define __S110 __PAGE_SHARED_EXEC
112#define __S111 __PAGE_SHARED_EXEC
113 107
114#ifndef __ASSEMBLY__
115/* 108/*
116 * ZERO_PAGE is a global shared page that is always zero: used 109 * ZERO_PAGE is a global shared page that is always zero: used
117 * for zero-mapped memory areas etc.. 110 * for zero-mapped memory areas etc..
@@ -265,6 +258,7 @@ static inline pmd_t pte_pmd(pte_t pte)
265#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) 258#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
266 259
267#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) 260#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
261#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
268 262
269#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) 263#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
270 264
@@ -273,6 +267,9 @@ static inline int has_transparent_hugepage(void)
273 return 1; 267 return 1;
274} 268}
275 269
270#define __pgprot_modify(prot,mask,bits) \
271 __pgprot((pgprot_val(prot) & ~(mask)) | (bits))
272
276/* 273/*
277 * Mark the prot value as uncacheable and unbufferable. 274 * Mark the prot value as uncacheable and unbufferable.
278 */ 275 */
@@ -295,11 +292,17 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
295#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ 292#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
296 PMD_TYPE_SECT) 293 PMD_TYPE_SECT)
297 294
 295#ifdef CONFIG_ARM64_64K_PAGES
296#define pud_sect(pud) (0)
297#else
298#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
299 PUD_TYPE_SECT)
300#endif
298 301
299static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 302static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
300{ 303{
301 *pmdp = pmd; 304 *pmdp = pmd;
302 dsb(); 305 dsb(ishst);
303} 306}
304 307
305static inline void pmd_clear(pmd_t *pmdp) 308static inline void pmd_clear(pmd_t *pmdp)
@@ -329,7 +332,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
329static inline void set_pud(pud_t *pudp, pud_t pud) 332static inline void set_pud(pud_t *pudp, pud_t pud)
330{ 333{
331 *pudp = pud; 334 *pudp = pud;
332 dsb(); 335 dsb(ishst);
333} 336}
334 337
335static inline void pud_clear(pud_t *pudp) 338static inline void pud_clear(pud_t *pudp)
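With 4K pages a level-1 (PUD) block descriptor maps 1GB, and the new pud_sect()/pud_pfn() helpers above let the kernel recognise and translate such mappings. A minimal sketch of the descriptor-type test, using local copies of the constants:

	/* Sketch of the level-1 block test: the low two bits of a valid PUD read
	 * 0b01 for a section (block) and 0b11 for a table descriptor.
	 */
	#define MY_PUD_TYPE_MASK	3UL
	#define MY_PUD_TYPE_SECT	1UL

	static inline int pud_is_section(unsigned long pudval)
	{
		return (pudval & MY_PUD_TYPE_MASK) == MY_PUD_TYPE_SECT;
	}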
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 45b20cd6cbca..34de2a8f7d93 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -79,6 +79,7 @@ struct thread_struct {
79 unsigned long tp_value; 79 unsigned long tp_value;
80 struct fpsimd_state fpsimd_state; 80 struct fpsimd_state fpsimd_state;
81 unsigned long fault_address; /* fault info */ 81 unsigned long fault_address; /* fault info */
82 unsigned long fault_code; /* ESR_EL1 value */
82 struct debug_info debug; /* debugging */ 83 struct debug_info debug; /* debugging */
83}; 84};
84 85
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index c7ba261dd4b3..a429b5940be2 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -135,6 +135,11 @@ struct pt_regs {
135#define user_stack_pointer(regs) \ 135#define user_stack_pointer(regs) \
136 (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp) 136 (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp)
137 137
138static inline unsigned long regs_return_value(struct pt_regs *regs)
139{
140 return regs->regs[0];
141}
142
138/* 143/*
139 * Are the current registers suitable for user mode? (used to maintain 144 * Are the current registers suitable for user mode? (used to maintain
140 * security in signal handlers) 145 * security in signal handlers)
diff --git a/arch/arm64/include/asm/sigcontext.h b/arch/arm64/include/asm/sigcontext.h
deleted file mode 100644
index dca1094acc74..000000000000
--- a/arch/arm64/include/asm/sigcontext.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/*
2 * Copyright (C) 2012 ARM Ltd.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16#ifndef __ASM_SIGCONTEXT_H
17#define __ASM_SIGCONTEXT_H
18
19#include <uapi/asm/sigcontext.h>
20
21/*
22 * Auxiliary context saved in the sigcontext.__reserved array. Not exported to
23 * user space as it will change with the addition of new context. User space
24 * should check the magic/size information.
25 */
26struct aux_context {
27 struct fpsimd_context fpsimd;
28 /* additional context to be added before "end" */
29 struct _aarch64_ctx end;
30};
31#endif
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 3ee8b303d9a9..64d2d4884a9d 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -22,6 +22,18 @@ extern char *strrchr(const char *, int c);
22#define __HAVE_ARCH_STRCHR 22#define __HAVE_ARCH_STRCHR
23extern char *strchr(const char *, int c); 23extern char *strchr(const char *, int c);
24 24
25#define __HAVE_ARCH_STRCMP
26extern int strcmp(const char *, const char *);
27
28#define __HAVE_ARCH_STRNCMP
29extern int strncmp(const char *, const char *, __kernel_size_t);
30
31#define __HAVE_ARCH_STRLEN
32extern __kernel_size_t strlen(const char *);
33
34#define __HAVE_ARCH_STRNLEN
35extern __kernel_size_t strnlen(const char *, __kernel_size_t);
36
25#define __HAVE_ARCH_MEMCPY 37#define __HAVE_ARCH_MEMCPY
26extern void *memcpy(void *, const void *, __kernel_size_t); 38extern void *memcpy(void *, const void *, __kernel_size_t);
27 39
@@ -34,4 +46,7 @@ extern void *memchr(const void *, int, __kernel_size_t);
34#define __HAVE_ARCH_MEMSET 46#define __HAVE_ARCH_MEMSET
35extern void *memset(void *, int, __kernel_size_t); 47extern void *memset(void *, int, __kernel_size_t);
36 48
49#define __HAVE_ARCH_MEMCMP
50extern int memcmp(const void *, const void *, size_t);
51
37#endif 52#endif
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
index 70ba9d4ee978..383771eb0b87 100644
--- a/arch/arm64/include/asm/syscall.h
+++ b/arch/arm64/include/asm/syscall.h
@@ -18,6 +18,7 @@
18 18
19#include <linux/err.h> 19#include <linux/err.h>
20 20
21extern const void *sys_call_table[];
21 22
22static inline int syscall_get_nr(struct task_struct *task, 23static inline int syscall_get_nr(struct task_struct *task,
23 struct pt_regs *regs) 24 struct pt_regs *regs)
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 7b8e3a2a00fb..e40b6d06d515 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -91,6 +91,9 @@ static inline struct thread_info *current_thread_info(void)
91/* 91/*
92 * thread information flags: 92 * thread information flags:
93 * TIF_SYSCALL_TRACE - syscall trace active 93 * TIF_SYSCALL_TRACE - syscall trace active
94 * TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace
95 * TIF_SYSCALL_AUDIT - syscall auditing
 96 * TIF_SECCOMP - syscall secure computing
94 * TIF_SIGPENDING - signal pending 97 * TIF_SIGPENDING - signal pending
95 * TIF_NEED_RESCHED - rescheduling necessary 98 * TIF_NEED_RESCHED - rescheduling necessary
96 * TIF_NOTIFY_RESUME - callback before returning to user 99 * TIF_NOTIFY_RESUME - callback before returning to user
@@ -99,7 +102,11 @@ static inline struct thread_info *current_thread_info(void)
99#define TIF_SIGPENDING 0 102#define TIF_SIGPENDING 0
100#define TIF_NEED_RESCHED 1 103#define TIF_NEED_RESCHED 1
101#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ 104#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
105#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
102#define TIF_SYSCALL_TRACE 8 106#define TIF_SYSCALL_TRACE 8
107#define TIF_SYSCALL_AUDIT 9
108#define TIF_SYSCALL_TRACEPOINT 10
109#define TIF_SECCOMP 11
103#define TIF_MEMDIE 18 /* is terminating due to OOM killer */ 110#define TIF_MEMDIE 18 /* is terminating due to OOM killer */
104#define TIF_FREEZE 19 111#define TIF_FREEZE 19
105#define TIF_RESTORE_SIGMASK 20 112#define TIF_RESTORE_SIGMASK 20
@@ -110,10 +117,18 @@ static inline struct thread_info *current_thread_info(void)
110#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 117#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
111#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 118#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
112#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 119#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
120#define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
121#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
122#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
123#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
124#define _TIF_SECCOMP (1 << TIF_SECCOMP)
113#define _TIF_32BIT (1 << TIF_32BIT) 125#define _TIF_32BIT (1 << TIF_32BIT)
114 126
115#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ 127#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
116 _TIF_NOTIFY_RESUME) 128 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
129
130#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
131 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)
117 132
118#endif /* __KERNEL__ */ 133#endif /* __KERNEL__ */
119#endif /* __ASM_THREAD_INFO_H */ 134#endif /* __ASM_THREAD_INFO_H */
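Grouping the tracing-related flags into _TIF_SYSCALL_WORK lets the syscall entry path test ptrace, audit, tracepoint and seccomp work with a single mask, which is exactly what the entry.S change later in this series does with tst/b.ne. The equivalent check in C, for illustration only:

	/* C rendering of the single-mask test performed at syscall entry;
	 * 'flags' stands for the current task's thread_info flags word.
	 */
	static inline int syscall_has_work(unsigned long flags)
	{
		const unsigned long mask = _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
					   _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP;

		return (flags & mask) != 0;
	}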
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 8b482035cfc2..b9349c4513ea 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -72,9 +72,9 @@ extern struct cpu_tlb_fns cpu_tlb;
72 */ 72 */
73static inline void flush_tlb_all(void) 73static inline void flush_tlb_all(void)
74{ 74{
75 dsb(); 75 dsb(ishst);
76 asm("tlbi vmalle1is"); 76 asm("tlbi vmalle1is");
77 dsb(); 77 dsb(ish);
78 isb(); 78 isb();
79} 79}
80 80
@@ -82,9 +82,9 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
82{ 82{
83 unsigned long asid = (unsigned long)ASID(mm) << 48; 83 unsigned long asid = (unsigned long)ASID(mm) << 48;
84 84
85 dsb(); 85 dsb(ishst);
86 asm("tlbi aside1is, %0" : : "r" (asid)); 86 asm("tlbi aside1is, %0" : : "r" (asid));
87 dsb(); 87 dsb(ish);
88} 88}
89 89
90static inline void flush_tlb_page(struct vm_area_struct *vma, 90static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -93,16 +93,36 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
93 unsigned long addr = uaddr >> 12 | 93 unsigned long addr = uaddr >> 12 |
94 ((unsigned long)ASID(vma->vm_mm) << 48); 94 ((unsigned long)ASID(vma->vm_mm) << 48);
95 95
96 dsb(); 96 dsb(ishst);
97 asm("tlbi vae1is, %0" : : "r" (addr)); 97 asm("tlbi vae1is, %0" : : "r" (addr));
98 dsb(); 98 dsb(ish);
99} 99}
100 100
101/* 101static inline void flush_tlb_range(struct vm_area_struct *vma,
102 * Convert calls to our calling convention. 102 unsigned long start, unsigned long end)
103 */ 103{
104#define flush_tlb_range(vma,start,end) __cpu_flush_user_tlb_range(start,end,vma) 104 unsigned long asid = (unsigned long)ASID(vma->vm_mm) << 48;
105#define flush_tlb_kernel_range(s,e) __cpu_flush_kern_tlb_range(s,e) 105 unsigned long addr;
106 start = asid | (start >> 12);
107 end = asid | (end >> 12);
108
109 dsb(ishst);
110 for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
111 asm("tlbi vae1is, %0" : : "r"(addr));
112 dsb(ish);
113}
114
115static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
116{
117 unsigned long addr;
118 start >>= 12;
119 end >>= 12;
120
121 dsb(ishst);
122 for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
123 asm("tlbi vaae1is, %0" : : "r"(addr));
124 dsb(ish);
125}
106 126
107/* 127/*
108 * On AArch64, the cache coherency is handled via the set_pte_at() function. 128 * On AArch64, the cache coherency is handled via the set_pte_at() function.
@@ -114,7 +134,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
114 * set_pte() does not have a DSB, so make sure that the page table 134 * set_pte() does not have a DSB, so make sure that the page table
115 * write is visible. 135 * write is visible.
116 */ 136 */
117 dsb(); 137 dsb(ishst);
118} 138}
119 139
120#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) 140#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
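The open-coded flush_tlb_range()/flush_tlb_kernel_range() above rely on the TLBI VAE1IS/VAAE1IS operand format: the ASID goes in bits 63:48 and bits [55:12] of the virtual address go in the low bits, which is why the code shifts addresses right by 12 and steps by 1 << (PAGE_SHIFT - 12), one operand per page regardless of page size. A small sketch of that encoding:

	/* Sketch of the tlbi operand built by flush_tlb_range(): ASID in the
	 * top 16 bits, VA[55:12] in the low bits (4K-granule index).
	 */
	static inline unsigned long tlbi_user_operand(unsigned long asid,
						      unsigned long vaddr)
	{
		return (asid << 48) | (vaddr >> 12);
	}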
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 0172e6d76bf3..7ebcd31ce51c 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -20,9 +20,6 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) 20#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling)
21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) 21#define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
22 22
23#define mc_capable() (cpu_topology[0].cluster_id != -1)
24#define smt_capable() (cpu_topology[0].thread_id != -1)
25
26void init_cpu_topology(void); 23void init_cpu_topology(void);
27void store_cpu_topology(unsigned int cpuid); 24void store_cpu_topology(unsigned int cpuid);
28const struct cpumask *cpu_coregroup_mask(int cpu); 25const struct cpumask *cpu_coregroup_mask(int cpu);
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index a4654c656a1e..e5f47df00c24 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -29,3 +29,5 @@
29#endif 29#endif
30#define __ARCH_WANT_SYS_CLONE 30#define __ARCH_WANT_SYS_CLONE
31#include <uapi/asm/unistd.h> 31#include <uapi/asm/unistd.h>
32
33#define NR_syscalls (__NR_syscalls)
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 690ad51cc901..b72cf405b3fe 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -53,5 +53,12 @@ struct fpsimd_context {
53 __uint128_t vregs[32]; 53 __uint128_t vregs[32];
54}; 54};
55 55
56/* ESR_EL1 context */
57#define ESR_MAGIC 0x45535201
58
59struct esr_context {
60 struct _aarch64_ctx head;
61 u64 esr;
62};
56 63
57#endif /* _UAPI__ASM_SIGCONTEXT_H */ 64#endif /* _UAPI__ASM_SIGCONTEXT_H */
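User space is expected to walk the sigcontext __reserved[] area as a sequence of _aarch64_ctx records, matching on the magic/size header; an ESR_MAGIC record now carries the faulting ESR_EL1 value. A hedged user-space sketch of that walk (types re-declared locally for the example, error handling kept minimal):

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	/* Local re-declarations for the sketch; real code would use the uapi
	 * <asm/sigcontext.h> definitions.
	 */
	struct my_aarch64_ctx { uint32_t magic; uint32_t size; };
	struct my_esr_context { struct my_aarch64_ctx head; uint64_t esr; };
	#define MY_ESR_MAGIC 0x45535201

	/* Return the recorded ESR value, or 0 if no esr_context is present. */
	static uint64_t find_esr(const uint8_t *reserved, size_t len)
	{
		size_t off = 0;

		while (off + sizeof(struct my_aarch64_ctx) <= len) {
			struct my_aarch64_ctx head;

			memcpy(&head, reserved + off, sizeof(head));
			if (head.magic == 0 || head.size < sizeof(head))
				break;			/* end marker or bogus record */
			if (head.magic == MY_ESR_MAGIC &&
			    off + sizeof(struct my_esr_context) <= len) {
				struct my_esr_context ctx;

				memcpy(&ctx, reserved + off, sizeof(ctx));
				return ctx.esr;
			}
			off += head.size;
		}
		return 0;
	}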
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index ba5e17a522d5..cdaedad3afe5 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -7,14 +7,19 @@ AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
7CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \ 7CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \
8 -I$(src)/../../../scripts/dtc/libfdt 8 -I$(src)/../../../scripts/dtc/libfdt
9 9
10CFLAGS_REMOVE_ftrace.o = -pg
11CFLAGS_REMOVE_insn.o = -pg
12CFLAGS_REMOVE_return_address.o = -pg
13
10# Object file lists. 14# Object file lists.
11arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \ 15arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \
12 entry-fpsimd.o process.o ptrace.o setup.o signal.o \ 16 entry-fpsimd.o process.o ptrace.o setup.o signal.o \
13 sys.o stacktrace.o time.o traps.o io.o vdso.o \ 17 sys.o stacktrace.o time.o traps.o io.o vdso.o \
14 hyp-stub.o psci.o cpu_ops.o insn.o 18 hyp-stub.o psci.o cpu_ops.o insn.o return_address.o
15 19
16arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ 20arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \
17 sys_compat.o 21 sys_compat.o
22arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o
18arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o 23arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o
19arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o 24arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o
20arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o 25arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..a85843ddbde8 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,10 +44,15 @@ EXPORT_SYMBOL(memstart_addr);
44 /* string / mem functions */ 44 /* string / mem functions */
45EXPORT_SYMBOL(strchr); 45EXPORT_SYMBOL(strchr);
46EXPORT_SYMBOL(strrchr); 46EXPORT_SYMBOL(strrchr);
47EXPORT_SYMBOL(strcmp);
48EXPORT_SYMBOL(strncmp);
49EXPORT_SYMBOL(strlen);
50EXPORT_SYMBOL(strnlen);
47EXPORT_SYMBOL(memset); 51EXPORT_SYMBOL(memset);
48EXPORT_SYMBOL(memcpy); 52EXPORT_SYMBOL(memcpy);
49EXPORT_SYMBOL(memmove); 53EXPORT_SYMBOL(memmove);
50EXPORT_SYMBOL(memchr); 54EXPORT_SYMBOL(memchr);
55EXPORT_SYMBOL(memcmp);
51 56
52 /* atomic bitops */ 57 /* atomic bitops */
53EXPORT_SYMBOL(set_bit); 58EXPORT_SYMBOL(set_bit);
@@ -56,3 +61,7 @@ EXPORT_SYMBOL(clear_bit);
56EXPORT_SYMBOL(test_and_clear_bit); 61EXPORT_SYMBOL(test_and_clear_bit);
57EXPORT_SYMBOL(change_bit); 62EXPORT_SYMBOL(change_bit);
58EXPORT_SYMBOL(test_and_change_bit); 63EXPORT_SYMBOL(test_and_change_bit);
64
65#ifdef CONFIG_FUNCTION_TRACER
66EXPORT_SYMBOL(_mcount);
67#endif
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 6a27cd6dbfa6..d358ccacfc00 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state)
41 fpsimd_restore x0, 8 41 fpsimd_restore x0, 8
42 ret 42 ret
43ENDPROC(fpsimd_load_state) 43ENDPROC(fpsimd_load_state)
44
45#ifdef CONFIG_KERNEL_MODE_NEON
46
47/*
48 * Save the bottom n FP registers.
49 *
50 * x0 - pointer to struct fpsimd_partial_state
51 */
52ENTRY(fpsimd_save_partial_state)
53 fpsimd_save_partial x0, 1, 8, 9
54 ret
 55ENDPROC(fpsimd_save_partial_state)
56
57/*
58 * Load the bottom n FP registers.
59 *
60 * x0 - pointer to struct fpsimd_partial_state
61 */
62ENTRY(fpsimd_load_partial_state)
63 fpsimd_restore_partial x0, 8, 9
64 ret
65ENDPROC(fpsimd_load_partial_state)
66
67#endif
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
new file mode 100644
index 000000000000..b051871f2965
--- /dev/null
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -0,0 +1,218 @@
1/*
2 * arch/arm64/kernel/entry-ftrace.S
3 *
4 * Copyright (C) 2013 Linaro Limited
5 * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/linkage.h>
13#include <asm/ftrace.h>
14#include <asm/insn.h>
15
16/*
17 * Gcc with -pg will put the following code in the beginning of each function:
18 * mov x0, x30
19 * bl _mcount
20 * [function's body ...]
21 * "bl _mcount" may be replaced to "bl ftrace_caller" or NOP if dynamic
22 * ftrace is enabled.
23 *
24 * Please note that x0 as an argument will not be used here because we can
25 * get lr(x30) of instrumented function at any time by winding up call stack
26 * as long as the kernel is compiled without -fomit-frame-pointer.
27 * (or CONFIG_FRAME_POINTER, this is forced on arm64)
28 *
29 * stack layout after mcount_enter in _mcount():
30 *
31 * current sp/fp => 0:+-----+
32 * in _mcount() | x29 | -> instrumented function's fp
33 * +-----+
34 * | x30 | -> _mcount()'s lr (= instrumented function's pc)
35 * old sp => +16:+-----+
36 * when instrumented | |
37 * function calls | ... |
38 * _mcount() | |
39 * | |
40 * instrumented => +xx:+-----+
41 * function's fp | x29 | -> parent's fp
42 * +-----+
43 * | x30 | -> instrumented function's lr (= parent's pc)
44 * +-----+
45 * | ... |
46 */
47
48 .macro mcount_enter
49 stp x29, x30, [sp, #-16]!
50 mov x29, sp
51 .endm
52
53 .macro mcount_exit
54 ldp x29, x30, [sp], #16
55 ret
56 .endm
57
58 .macro mcount_adjust_addr rd, rn
59 sub \rd, \rn, #AARCH64_INSN_SIZE
60 .endm
61
62 /* for instrumented function's parent */
63 .macro mcount_get_parent_fp reg
64 ldr \reg, [x29]
65 ldr \reg, [\reg]
66 .endm
67
68 /* for instrumented function */
69 .macro mcount_get_pc0 reg
70 mcount_adjust_addr \reg, x30
71 .endm
72
73 .macro mcount_get_pc reg
74 ldr \reg, [x29, #8]
75 mcount_adjust_addr \reg, \reg
76 .endm
77
78 .macro mcount_get_lr reg
79 ldr \reg, [x29]
80 ldr \reg, [\reg, #8]
81 mcount_adjust_addr \reg, \reg
82 .endm
83
84 .macro mcount_get_lr_addr reg
85 ldr \reg, [x29]
86 add \reg, \reg, #8
87 .endm
88
89#ifndef CONFIG_DYNAMIC_FTRACE
90/*
91 * void _mcount(unsigned long return_address)
92 * @return_address: return address to instrumented function
93 *
94 * This function makes calls, if enabled, to:
95 * - tracer function to probe instrumented function's entry,
96 * - ftrace_graph_caller to set up an exit hook
97 */
98ENTRY(_mcount)
99#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
100 ldr x0, =ftrace_trace_stop
101 ldr x0, [x0] // if ftrace_trace_stop
102 ret // return;
103#endif
104 mcount_enter
105
106 ldr x0, =ftrace_trace_function
107 ldr x2, [x0]
108 adr x0, ftrace_stub
109 cmp x0, x2 // if (ftrace_trace_function
110 b.eq skip_ftrace_call // != ftrace_stub) {
111
112 mcount_get_pc x0 // function's pc
113 mcount_get_lr x1 // function's lr (= parent's pc)
114 blr x2 // (*ftrace_trace_function)(pc, lr);
115
116#ifndef CONFIG_FUNCTION_GRAPH_TRACER
117skip_ftrace_call: // return;
118 mcount_exit // }
119#else
120 mcount_exit // return;
121 // }
122skip_ftrace_call:
123 ldr x1, =ftrace_graph_return
124 ldr x2, [x1] // if ((ftrace_graph_return
125 cmp x0, x2 // != ftrace_stub)
126 b.ne ftrace_graph_caller
127
128 ldr x1, =ftrace_graph_entry // || (ftrace_graph_entry
129 ldr x2, [x1] // != ftrace_graph_entry_stub))
130 ldr x0, =ftrace_graph_entry_stub
131 cmp x0, x2
132 b.ne ftrace_graph_caller // ftrace_graph_caller();
133
134 mcount_exit
135#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
136ENDPROC(_mcount)
137
138#else /* CONFIG_DYNAMIC_FTRACE */
139/*
 140 * _mcount() is used to build the kernel with the -pg option, but all the
 141 * branch instructions to _mcount() are replaced with NOPs at kernel start up,
 142 * and later on each NOP is patched to a branch to ftrace_caller() when enabled,
 143 * or back to a NOP when disabled, on a per-function basis.
144 */
145ENTRY(_mcount)
146 ret
147ENDPROC(_mcount)
148
149/*
150 * void ftrace_caller(unsigned long return_address)
151 * @return_address: return address to instrumented function
152 *
153 * This function is a counterpart of _mcount() in 'static' ftrace, and
154 * makes calls to:
155 * - tracer function to probe instrumented function's entry,
156 * - ftrace_graph_caller to set up an exit hook
157 */
158ENTRY(ftrace_caller)
159 mcount_enter
160
161 mcount_get_pc0 x0 // function's pc
162 mcount_get_lr x1 // function's lr
163
164 .global ftrace_call
165ftrace_call: // tracer(pc, lr);
166 nop // This will be replaced with "bl xxx"
167 // where xxx can be any kind of tracer.
168
169#ifdef CONFIG_FUNCTION_GRAPH_TRACER
170 .global ftrace_graph_call
171ftrace_graph_call: // ftrace_graph_caller();
172 nop // If enabled, this will be replaced
173 // "b ftrace_graph_caller"
174#endif
175
176 mcount_exit
177ENDPROC(ftrace_caller)
178#endif /* CONFIG_DYNAMIC_FTRACE */
179
180ENTRY(ftrace_stub)
181 ret
182ENDPROC(ftrace_stub)
183
184#ifdef CONFIG_FUNCTION_GRAPH_TRACER
185/*
186 * void ftrace_graph_caller(void)
187 *
188 * Called from _mcount() or ftrace_caller() when function_graph tracer is
189 * selected.
190 * This function w/ prepare_ftrace_return() fakes link register's value on
191 * the call stack in order to intercept instrumented function's return path
192 * and run return_to_handler() later on its exit.
193 */
194ENTRY(ftrace_graph_caller)
195 mcount_get_lr_addr x0 // pointer to function's saved lr
196 mcount_get_pc x1 // function's pc
197 mcount_get_parent_fp x2 // parent's fp
198 bl prepare_ftrace_return // prepare_ftrace_return(&lr, pc, fp)
199
200 mcount_exit
201ENDPROC(ftrace_graph_caller)
202
203/*
204 * void return_to_handler(void)
205 *
206 * Run ftrace_return_to_handler() before going back to parent.
207 * @fp is checked against the value passed by ftrace_graph_caller()
208 * only when CONFIG_FUNCTION_GRAPH_FP_TEST is enabled.
209 */
210ENTRY(return_to_handler)
211 str x0, [sp, #-16]!
212 mov x0, x29 // parent's fp
 213 bl ftrace_return_to_handler // addr = ftrace_return_to_handler(fp);
214 mov x30, x0 // restore the original return address
215 ldr x0, [sp], #16
216 ret
217END(return_to_handler)
218#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
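In C terms, the frame that mcount_enter pushes is just an ordinary {fp, lr} record placed below the instrumented function's own record, which is what lets the mcount_get_* macros recover the instrumented function's pc, its saved lr and the address of that lr slot. A hedged sketch of that view (illustrative types, not kernel code):

	/* AArch64 frame record as pushed by "stp x29, x30, [sp, #-16]!". */
	struct frame_record {
		unsigned long fp;	/* previous frame record (x29) */
		unsigned long lr;	/* saved return address (x30) */
	};

	/* pc of the instrumented function: _mcount()'s saved lr minus one insn. */
	static inline unsigned long instrumented_pc(const struct frame_record *mcount_fp)
	{
		return mcount_fp->lr - 4;		/* AARCH64_INSN_SIZE */
	}

	/* Address of the instrumented function's saved lr: the slot that
	 * ftrace_graph_caller() hands to prepare_ftrace_return() for rewriting.
	 */
	static inline unsigned long *instrumented_lr_addr(const struct frame_record *mcount_fp)
	{
		struct frame_record *instr = (struct frame_record *)mcount_fp->fp;

		return &instr->lr;
	}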
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 39ac630d83de..bf017f4ffb4f 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -60,6 +60,9 @@
60 push x0, x1 60 push x0, x1
61 .if \el == 0 61 .if \el == 0
62 mrs x21, sp_el0 62 mrs x21, sp_el0
63 get_thread_info tsk // Ensure MDSCR_EL1.SS is clear,
64 ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug
65 disable_step_tsk x19, x20 // exceptions when scheduling.
63 .else 66 .else
64 add x21, sp, #S_FRAME_SIZE 67 add x21, sp, #S_FRAME_SIZE
65 .endif 68 .endif
@@ -259,7 +262,7 @@ el1_da:
259 * Data abort handling 262 * Data abort handling
260 */ 263 */
261 mrs x0, far_el1 264 mrs x0, far_el1
262 enable_dbg_if_not_stepping x2 265 enable_dbg
263 // re-enable interrupts if they were enabled in the aborted context 266 // re-enable interrupts if they were enabled in the aborted context
264 tbnz x23, #7, 1f // PSR_I_BIT 267 tbnz x23, #7, 1f // PSR_I_BIT
265 enable_irq 268 enable_irq
@@ -275,6 +278,7 @@ el1_sp_pc:
275 * Stack or PC alignment exception handling 278 * Stack or PC alignment exception handling
276 */ 279 */
277 mrs x0, far_el1 280 mrs x0, far_el1
281 enable_dbg
278 mov x1, x25 282 mov x1, x25
279 mov x2, sp 283 mov x2, sp
280 b do_sp_pc_abort 284 b do_sp_pc_abort
@@ -282,6 +286,7 @@ el1_undef:
282 /* 286 /*
283 * Undefined instruction 287 * Undefined instruction
284 */ 288 */
289 enable_dbg
285 mov x0, sp 290 mov x0, sp
286 b do_undefinstr 291 b do_undefinstr
287el1_dbg: 292el1_dbg:
@@ -294,10 +299,11 @@ el1_dbg:
294 mrs x0, far_el1 299 mrs x0, far_el1
295 mov x2, sp // struct pt_regs 300 mov x2, sp // struct pt_regs
296 bl do_debug_exception 301 bl do_debug_exception
297 302 enable_dbg
298 kernel_exit 1 303 kernel_exit 1
299el1_inv: 304el1_inv:
300 // TODO: add support for undefined instructions in kernel mode 305 // TODO: add support for undefined instructions in kernel mode
306 enable_dbg
301 mov x0, sp 307 mov x0, sp
302 mov x1, #BAD_SYNC 308 mov x1, #BAD_SYNC
303 mrs x2, esr_el1 309 mrs x2, esr_el1
@@ -307,7 +313,7 @@ ENDPROC(el1_sync)
307 .align 6 313 .align 6
308el1_irq: 314el1_irq:
309 kernel_entry 1 315 kernel_entry 1
310 enable_dbg_if_not_stepping x0 316 enable_dbg
311#ifdef CONFIG_TRACE_IRQFLAGS 317#ifdef CONFIG_TRACE_IRQFLAGS
312 bl trace_hardirqs_off 318 bl trace_hardirqs_off
313#endif 319#endif
@@ -332,8 +338,7 @@ ENDPROC(el1_irq)
332#ifdef CONFIG_PREEMPT 338#ifdef CONFIG_PREEMPT
333el1_preempt: 339el1_preempt:
334 mov x24, lr 340 mov x24, lr
3351: enable_dbg 3411: bl preempt_schedule_irq // irq en/disable is done inside
336 bl preempt_schedule_irq // irq en/disable is done inside
337 ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS 342 ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
338 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? 343 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
339 ret x24 344 ret x24
@@ -349,7 +354,7 @@ el0_sync:
349 lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class 354 lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class
350 cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state 355 cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state
351 b.eq el0_svc 356 b.eq el0_svc
352 adr lr, ret_from_exception 357 adr lr, ret_to_user
353 cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 358 cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0
354 b.eq el0_da 359 b.eq el0_da
355 cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 360 cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0
@@ -378,7 +383,7 @@ el0_sync_compat:
378 lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class 383 lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class
379 cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state 384 cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state
380 b.eq el0_svc_compat 385 b.eq el0_svc_compat
381 adr lr, ret_from_exception 386 adr lr, ret_to_user
382 cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 387 cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0
383 b.eq el0_da 388 b.eq el0_da
384 cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 389 cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0
@@ -423,11 +428,8 @@ el0_da:
423 */ 428 */
424 mrs x0, far_el1 429 mrs x0, far_el1
425 bic x0, x0, #(0xff << 56) 430 bic x0, x0, #(0xff << 56)
426 disable_step x1
427 isb
428 enable_dbg
429 // enable interrupts before calling the main handler 431 // enable interrupts before calling the main handler
430 enable_irq 432 enable_dbg_and_irq
431 mov x1, x25 433 mov x1, x25
432 mov x2, sp 434 mov x2, sp
433 b do_mem_abort 435 b do_mem_abort
@@ -436,11 +438,8 @@ el0_ia:
436 * Instruction abort handling 438 * Instruction abort handling
437 */ 439 */
438 mrs x0, far_el1 440 mrs x0, far_el1
439 disable_step x1
440 isb
441 enable_dbg
442 // enable interrupts before calling the main handler 441 // enable interrupts before calling the main handler
443 enable_irq 442 enable_dbg_and_irq
444 orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts 443 orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts
445 mov x2, sp 444 mov x2, sp
446 b do_mem_abort 445 b do_mem_abort
@@ -448,6 +447,7 @@ el0_fpsimd_acc:
448 /* 447 /*
449 * Floating Point or Advanced SIMD access 448 * Floating Point or Advanced SIMD access
450 */ 449 */
450 enable_dbg
451 mov x0, x25 451 mov x0, x25
452 mov x1, sp 452 mov x1, sp
453 b do_fpsimd_acc 453 b do_fpsimd_acc
@@ -455,6 +455,7 @@ el0_fpsimd_exc:
455 /* 455 /*
456 * Floating Point or Advanced SIMD exception 456 * Floating Point or Advanced SIMD exception
457 */ 457 */
458 enable_dbg
458 mov x0, x25 459 mov x0, x25
459 mov x1, sp 460 mov x1, sp
460 b do_fpsimd_exc 461 b do_fpsimd_exc
@@ -463,11 +464,8 @@ el0_sp_pc:
463 * Stack or PC alignment exception handling 464 * Stack or PC alignment exception handling
464 */ 465 */
465 mrs x0, far_el1 466 mrs x0, far_el1
466 disable_step x1
467 isb
468 enable_dbg
469 // enable interrupts before calling the main handler 467 // enable interrupts before calling the main handler
470 enable_irq 468 enable_dbg_and_irq
471 mov x1, x25 469 mov x1, x25
472 mov x2, sp 470 mov x2, sp
473 b do_sp_pc_abort 471 b do_sp_pc_abort
@@ -475,9 +473,9 @@ el0_undef:
475 /* 473 /*
476 * Undefined instruction 474 * Undefined instruction
477 */ 475 */
478 mov x0, sp
479 // enable interrupts before calling the main handler 476 // enable interrupts before calling the main handler
480 enable_irq 477 enable_dbg_and_irq
478 mov x0, sp
481 b do_undefinstr 479 b do_undefinstr
482el0_dbg: 480el0_dbg:
483 /* 481 /*
@@ -485,11 +483,13 @@ el0_dbg:
485 */ 483 */
486 tbnz x24, #0, el0_inv // EL0 only 484 tbnz x24, #0, el0_inv // EL0 only
487 mrs x0, far_el1 485 mrs x0, far_el1
488 disable_step x1
489 mov x1, x25 486 mov x1, x25
490 mov x2, sp 487 mov x2, sp
491 b do_debug_exception 488 bl do_debug_exception
489 enable_dbg
490 b ret_to_user
492el0_inv: 491el0_inv:
492 enable_dbg
493 mov x0, sp 493 mov x0, sp
494 mov x1, #BAD_SYNC 494 mov x1, #BAD_SYNC
495 mrs x2, esr_el1 495 mrs x2, esr_el1
@@ -500,15 +500,12 @@ ENDPROC(el0_sync)
500el0_irq: 500el0_irq:
501 kernel_entry 0 501 kernel_entry 0
502el0_irq_naked: 502el0_irq_naked:
503 disable_step x1
504 isb
505 enable_dbg 503 enable_dbg
506#ifdef CONFIG_TRACE_IRQFLAGS 504#ifdef CONFIG_TRACE_IRQFLAGS
507 bl trace_hardirqs_off 505 bl trace_hardirqs_off
508#endif 506#endif
509 507
510 irq_handler 508 irq_handler
511 get_thread_info tsk
512 509
513#ifdef CONFIG_TRACE_IRQFLAGS 510#ifdef CONFIG_TRACE_IRQFLAGS
514 bl trace_hardirqs_on 511 bl trace_hardirqs_on
@@ -517,14 +514,6 @@ el0_irq_naked:
517ENDPROC(el0_irq) 514ENDPROC(el0_irq)
518 515
519/* 516/*
520 * This is the return code to user mode for abort handlers
521 */
522ret_from_exception:
523 get_thread_info tsk
524 b ret_to_user
525ENDPROC(ret_from_exception)
526
527/*
528 * Register switch for AArch64. The callee-saved registers need to be saved 517 * Register switch for AArch64. The callee-saved registers need to be saved
529 * and restored. On entry: 518 * and restored. On entry:
530 * x0 = previous task_struct (must be preserved across the switch) 519 * x0 = previous task_struct (must be preserved across the switch)
@@ -563,10 +552,7 @@ ret_fast_syscall:
563 ldr x1, [tsk, #TI_FLAGS] 552 ldr x1, [tsk, #TI_FLAGS]
564 and x2, x1, #_TIF_WORK_MASK 553 and x2, x1, #_TIF_WORK_MASK
565 cbnz x2, fast_work_pending 554 cbnz x2, fast_work_pending
566 tbz x1, #TIF_SINGLESTEP, fast_exit 555 enable_step_tsk x1, x2
567 disable_dbg
568 enable_step x2
569fast_exit:
570 kernel_exit 0, ret = 1 556 kernel_exit 0, ret = 1
571 557
572/* 558/*
@@ -576,7 +562,7 @@ fast_work_pending:
576 str x0, [sp, #S_X0] // returned x0 562 str x0, [sp, #S_X0] // returned x0
577work_pending: 563work_pending:
578 tbnz x1, #TIF_NEED_RESCHED, work_resched 564 tbnz x1, #TIF_NEED_RESCHED, work_resched
579 /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ 565 /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
580 ldr x2, [sp, #S_PSTATE] 566 ldr x2, [sp, #S_PSTATE]
581 mov x0, sp // 'regs' 567 mov x0, sp // 'regs'
582 tst x2, #PSR_MODE_MASK // user mode regs? 568 tst x2, #PSR_MODE_MASK // user mode regs?
@@ -585,7 +571,6 @@ work_pending:
585 bl do_notify_resume 571 bl do_notify_resume
586 b ret_to_user 572 b ret_to_user
587work_resched: 573work_resched:
588 enable_dbg
589 bl schedule 574 bl schedule
590 575
591/* 576/*
@@ -596,9 +581,7 @@ ret_to_user:
596 ldr x1, [tsk, #TI_FLAGS] 581 ldr x1, [tsk, #TI_FLAGS]
597 and x2, x1, #_TIF_WORK_MASK 582 and x2, x1, #_TIF_WORK_MASK
598 cbnz x2, work_pending 583 cbnz x2, work_pending
599 tbz x1, #TIF_SINGLESTEP, no_work_pending 584 enable_step_tsk x1, x2
600 disable_dbg
601 enable_step x2
602no_work_pending: 585no_work_pending:
603 kernel_exit 0, ret = 0 586 kernel_exit 0, ret = 0
604ENDPROC(ret_to_user) 587ENDPROC(ret_to_user)
@@ -625,14 +608,11 @@ el0_svc:
625 mov sc_nr, #__NR_syscalls 608 mov sc_nr, #__NR_syscalls
626el0_svc_naked: // compat entry point 609el0_svc_naked: // compat entry point
627 stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number 610 stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number
628 disable_step x16 611 enable_dbg_and_irq
629 isb
630 enable_dbg
631 enable_irq
632 612
633 get_thread_info tsk 613 ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks
634 ldr x16, [tsk, #TI_FLAGS] // check for syscall tracing 614 tst x16, #_TIF_SYSCALL_WORK
635 tbnz x16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls? 615 b.ne __sys_trace
636 adr lr, ret_fast_syscall // return address 616 adr lr, ret_fast_syscall // return address
637 cmp scno, sc_nr // check upper syscall limit 617 cmp scno, sc_nr // check upper syscall limit
638 b.hs ni_sys 618 b.hs ni_sys
@@ -648,9 +628,8 @@ ENDPROC(el0_svc)
648 * switches, and waiting for our parent to respond. 628 * switches, and waiting for our parent to respond.
649 */ 629 */
650__sys_trace: 630__sys_trace:
651 mov x1, sp 631 mov x0, sp
652 mov w0, #0 // trace entry 632 bl syscall_trace_enter
653 bl syscall_trace
654 adr lr, __sys_trace_return // return address 633 adr lr, __sys_trace_return // return address
655 uxtw scno, w0 // syscall number (possibly new) 634 uxtw scno, w0 // syscall number (possibly new)
656 mov x1, sp // pointer to regs 635 mov x1, sp // pointer to regs
@@ -665,9 +644,8 @@ __sys_trace:
665 644
666__sys_trace_return: 645__sys_trace_return:
667 str x0, [sp] // save returned x0 646 str x0, [sp] // save returned x0
668 mov x1, sp 647 mov x0, sp
669 mov w0, #1 // trace exit 648 bl syscall_trace_exit
670 bl syscall_trace
671 b ret_to_user 649 b ret_to_user
672 650
673/* 651/*
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 4aef42a04bdc..ad8aebb1cdef 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -35,6 +35,60 @@
35#define FPEXC_IDF (1 << 7) 35#define FPEXC_IDF (1 << 7)
36 36
37/* 37/*
38 * In order to reduce the number of times the FPSIMD state is needlessly saved
39 * and restored, we need to keep track of two things:
40 * (a) for each task, we need to remember which CPU was the last one to have
41 * the task's FPSIMD state loaded into its FPSIMD registers;
42 * (b) for each CPU, we need to remember which task's userland FPSIMD state has
43 * been loaded into its FPSIMD registers most recently, or whether it has
44 * been used to perform kernel mode NEON in the meantime.
45 *
46 * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to
 47 * the id of the current CPU every time the state is loaded onto a CPU. For (b),
48 * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the
49 * address of the userland FPSIMD state of the task that was loaded onto the CPU
50 * most recently, or NULL if kernel mode NEON has been performed after that.
51 *
52 * With this in place, we no longer have to restore the next FPSIMD state right
53 * when switching between tasks. Instead, we can defer this check to userland
54 * resume, at which time we verify whether the CPU's fpsimd_last_state and the
55 * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we
56 * can omit the FPSIMD restore.
57 *
58 * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to
59 * indicate whether or not the userland FPSIMD state of the current task is
60 * present in the registers. The flag is set unless the FPSIMD registers of this
61 * CPU currently contain the most recent userland FPSIMD state of the current
62 * task.
63 *
64 * For a certain task, the sequence may look something like this:
65 * - the task gets scheduled in; if both the task's fpsimd_state.cpu field
66 * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu
67 * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is
68 * cleared, otherwise it is set;
69 *
70 * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's
71 * userland FPSIMD state is copied from memory to the registers, the task's
72 * fpsimd_state.cpu field is set to the id of the current CPU, the current
73 * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the
74 * TIF_FOREIGN_FPSTATE flag is cleared;
75 *
76 * - the task executes an ordinary syscall; upon return to userland, the
77 * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is
78 * restored;
79 *
80 * - the task executes a syscall which executes some NEON instructions; this is
81 * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD
82 * register contents to memory, clears the fpsimd_last_state per-cpu variable
83 * and sets the TIF_FOREIGN_FPSTATE flag;
84 *
85 * - the task gets preempted after kernel_neon_end() is called; as we have not
86 * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
87 * whatever is in the FPSIMD registers is not saved to memory, but discarded.
88 */
89static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
90
91/*
38 * Trapped FP/ASIMD access. 92 * Trapped FP/ASIMD access.
39 */ 93 */
40void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) 94void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
72 126
73void fpsimd_thread_switch(struct task_struct *next) 127void fpsimd_thread_switch(struct task_struct *next)
74{ 128{
75 /* check if not kernel threads */ 129 /*
76 if (current->mm) 130 * Save the current FPSIMD state to memory, but only if whatever is in
131 * the registers is in fact the most recent userland FPSIMD state of
132 * 'current'.
133 */
134 if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
77 fpsimd_save_state(&current->thread.fpsimd_state); 135 fpsimd_save_state(&current->thread.fpsimd_state);
78 if (next->mm) 136
79 fpsimd_load_state(&next->thread.fpsimd_state); 137 if (next->mm) {
138 /*
139 * If we are switching to a task whose most recent userland
140 * FPSIMD state is already in the registers of *this* cpu,
141 * we can skip loading the state from memory. Otherwise, set
142 * the TIF_FOREIGN_FPSTATE flag so the state will be loaded
143 * upon the next return to userland.
144 */
145 struct fpsimd_state *st = &next->thread.fpsimd_state;
146
147 if (__this_cpu_read(fpsimd_last_state) == st
148 && st->cpu == smp_processor_id())
149 clear_ti_thread_flag(task_thread_info(next),
150 TIF_FOREIGN_FPSTATE);
151 else
152 set_ti_thread_flag(task_thread_info(next),
153 TIF_FOREIGN_FPSTATE);
154 }
80} 155}
81 156
82void fpsimd_flush_thread(void) 157void fpsimd_flush_thread(void)
83{ 158{
84 preempt_disable();
85 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); 159 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
86 fpsimd_load_state(&current->thread.fpsimd_state); 160 set_thread_flag(TIF_FOREIGN_FPSTATE);
161}
162
163/*
164 * Save the userland FPSIMD state of 'current' to memory, but only if the state
165 * currently held in the registers does in fact belong to 'current'
166 */
167void fpsimd_preserve_current_state(void)
168{
169 preempt_disable();
170 if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
171 fpsimd_save_state(&current->thread.fpsimd_state);
172 preempt_enable();
173}
174
175/*
176 * Load the userland FPSIMD state of 'current' from memory, but only if the
177 * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
178 * state of 'current'
179 */
180void fpsimd_restore_current_state(void)
181{
182 preempt_disable();
183 if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
184 struct fpsimd_state *st = &current->thread.fpsimd_state;
185
186 fpsimd_load_state(st);
187 this_cpu_write(fpsimd_last_state, st);
188 st->cpu = smp_processor_id();
189 }
190 preempt_enable();
191}
192
193/*
194 * Load an updated userland FPSIMD state for 'current' from memory and set the
195 * flag that indicates that the FPSIMD register contents are the most recent
196 * FPSIMD state of 'current'
197 */
198void fpsimd_update_current_state(struct fpsimd_state *state)
199{
200 preempt_disable();
201 fpsimd_load_state(state);
202 if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
203 struct fpsimd_state *st = &current->thread.fpsimd_state;
204
205 this_cpu_write(fpsimd_last_state, st);
206 st->cpu = smp_processor_id();
207 }
87 preempt_enable(); 208 preempt_enable();
88} 209}
89 210
211/*
212 * Invalidate live CPU copies of task t's FPSIMD state
213 */
214void fpsimd_flush_task_state(struct task_struct *t)
215{
216 t->thread.fpsimd_state.cpu = NR_CPUS;
217}
218
90#ifdef CONFIG_KERNEL_MODE_NEON 219#ifdef CONFIG_KERNEL_MODE_NEON
91 220
221static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate);
222static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);
223
92/* 224/*
93 * Kernel-side NEON support functions 225 * Kernel-side NEON support functions
94 */ 226 */
95void kernel_neon_begin(void) 227void kernel_neon_begin_partial(u32 num_regs)
96{ 228{
97 /* Avoid using the NEON in interrupt context */ 229 if (in_interrupt()) {
98 BUG_ON(in_interrupt()); 230 struct fpsimd_partial_state *s = this_cpu_ptr(
99 preempt_disable(); 231 in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
100 232
101 if (current->mm) 233 BUG_ON(num_regs > 32);
102 fpsimd_save_state(&current->thread.fpsimd_state); 234 fpsimd_save_partial_state(s, roundup(num_regs, 2));
235 } else {
236 /*
237 * Save the userland FPSIMD state if we have one and if we
238 * haven't done so already. Clear fpsimd_last_state to indicate
239 * that there is no longer userland FPSIMD state in the
240 * registers.
241 */
242 preempt_disable();
243 if (current->mm &&
244 !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
245 fpsimd_save_state(&current->thread.fpsimd_state);
246 this_cpu_write(fpsimd_last_state, NULL);
247 }
103} 248}
104EXPORT_SYMBOL(kernel_neon_begin); 249EXPORT_SYMBOL(kernel_neon_begin_partial);
105 250
106void kernel_neon_end(void) 251void kernel_neon_end(void)
107{ 252{
108 if (current->mm) 253 if (in_interrupt()) {
109 fpsimd_load_state(&current->thread.fpsimd_state); 254 struct fpsimd_partial_state *s = this_cpu_ptr(
110 255 in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
111 preempt_enable(); 256 fpsimd_load_partial_state(s);
257 } else {
258 preempt_enable();
259 }
112} 260}
113EXPORT_SYMBOL(kernel_neon_end); 261EXPORT_SYMBOL(kernel_neon_end);
114 262
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self,
120{ 268{
121 switch (cmd) { 269 switch (cmd) {
122 case CPU_PM_ENTER: 270 case CPU_PM_ENTER:
123 if (current->mm) 271 if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
124 fpsimd_save_state(&current->thread.fpsimd_state); 272 fpsimd_save_state(&current->thread.fpsimd_state);
125 break; 273 break;
126 case CPU_PM_EXIT: 274 case CPU_PM_EXIT:
127 if (current->mm) 275 if (current->mm)
128 fpsimd_load_state(&current->thread.fpsimd_state); 276 set_thread_flag(TIF_FOREIGN_FPSTATE);
129 break; 277 break;
130 case CPU_PM_ENTER_FAILED: 278 case CPU_PM_ENTER_FAILED:
131 default: 279 default:
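
The kernel_neon_begin_partial()/kernel_neon_end() pair introduced above is what allows NEON use from hardirq/softirq context with only a partial register save. A hedged sketch of a caller is shown below; the NEON routine name is hypothetical, and only the begin/end calls (declared in <asm/neon.h>) are taken from this series. The bracketed region must not sleep, since preemption is disabled in the task-context case.

	#include <asm/neon.h>

	/* Hypothetical helper, implemented elsewhere in assembly using only v0-v3. */
	void my_neon_xor_blocks(void *dst, const void *src, int blocks);

	static void xor_blocks_neon(void *dst, const void *src, int blocks)
	{
		/* Only the first four V-registers need preserving on our behalf. */
		kernel_neon_begin_partial(4);
		my_neon_xor_blocks(dst, src, blocks);
		kernel_neon_end();
	}
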
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
new file mode 100644
index 000000000000..7924d73b6476
--- /dev/null
+++ b/arch/arm64/kernel/ftrace.c
@@ -0,0 +1,176 @@
1/*
2 * arch/arm64/kernel/ftrace.c
3 *
4 * Copyright (C) 2013 Linaro Limited
5 * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/ftrace.h>
13#include <linux/swab.h>
14#include <linux/uaccess.h>
15
16#include <asm/cacheflush.h>
17#include <asm/ftrace.h>
18#include <asm/insn.h>
19
20#ifdef CONFIG_DYNAMIC_FTRACE
21/*
22 * Replace a single instruction, which may be a branch or NOP.
23 * If @validate == true, a replaced instruction is checked against 'old'.
24 */
25static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
26 bool validate)
27{
28 u32 replaced;
29
30 /*
31 * Note:
32 * Due to modules and __init, code can disappear and change,
33 * we need to protect against faulting as well as code changing.
34 * We do this by aarch64_insn_*() which use the probe_kernel_*().
35 *
36 * No lock is held here because all the modifications are run
37 * through stop_machine().
38 */
39 if (validate) {
40 if (aarch64_insn_read((void *)pc, &replaced))
41 return -EFAULT;
42
43 if (replaced != old)
44 return -EINVAL;
45 }
46 if (aarch64_insn_patch_text_nosync((void *)pc, new))
47 return -EPERM;
48
49 return 0;
50}
51
52/*
53 * Replace tracer function in ftrace_caller()
54 */
55int ftrace_update_ftrace_func(ftrace_func_t func)
56{
57 unsigned long pc;
58 u32 new;
59
60 pc = (unsigned long)&ftrace_call;
61 new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, true);
62
63 return ftrace_modify_code(pc, 0, new, false);
64}
65
66/*
67 * Turn on the call to ftrace_caller() in instrumented function
68 */
69int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
70{
71 unsigned long pc = rec->ip;
72 u32 old, new;
73
74 old = aarch64_insn_gen_nop();
75 new = aarch64_insn_gen_branch_imm(pc, addr, true);
76
77 return ftrace_modify_code(pc, old, new, true);
78}
79
80/*
81 * Turn off the call to ftrace_caller() in instrumented function
82 */
83int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
84 unsigned long addr)
85{
86 unsigned long pc = rec->ip;
87 u32 old, new;
88
89 old = aarch64_insn_gen_branch_imm(pc, addr, true);
90 new = aarch64_insn_gen_nop();
91
92 return ftrace_modify_code(pc, old, new, true);
93}
94
95int __init ftrace_dyn_arch_init(void)
96{
97 return 0;
98}
99#endif /* CONFIG_DYNAMIC_FTRACE */
100
101#ifdef CONFIG_FUNCTION_GRAPH_TRACER
102/*
103 * function_graph tracer expects ftrace_return_to_handler() to be called
104 * on the way back to parent. For this purpose, this function is called
105 * in _mcount() or ftrace_caller() to replace return address (*parent) on
106 * the call stack to return_to_handler.
107 *
108 * Note that @frame_pointer is used only for sanity check later.
109 */
110void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
111 unsigned long frame_pointer)
112{
113 unsigned long return_hooker = (unsigned long)&return_to_handler;
114 unsigned long old;
115 struct ftrace_graph_ent trace;
116 int err;
117
118 if (unlikely(atomic_read(&current->tracing_graph_pause)))
119 return;
120
121 /*
122 * Note:
123 * No protection against faulting at *parent, which may be seen
124 * on other archs. It's unlikely on AArch64.
125 */
126 old = *parent;
127 *parent = return_hooker;
128
129 trace.func = self_addr;
130 trace.depth = current->curr_ret_stack + 1;
131
132 /* Only trace if the calling function expects to */
133 if (!ftrace_graph_entry(&trace)) {
134 *parent = old;
135 return;
136 }
137
138 err = ftrace_push_return_trace(old, self_addr, &trace.depth,
139 frame_pointer);
140 if (err == -EBUSY) {
141 *parent = old;
142 return;
143 }
144}
145
146#ifdef CONFIG_DYNAMIC_FTRACE
147/*
148 * Turn on/off the call to ftrace_graph_caller() in ftrace_caller()
149 * depending on @enable.
150 */
151static int ftrace_modify_graph_caller(bool enable)
152{
153 unsigned long pc = (unsigned long)&ftrace_graph_call;
154 u32 branch, nop;
155
156 branch = aarch64_insn_gen_branch_imm(pc,
157 (unsigned long)ftrace_graph_caller, false);
158 nop = aarch64_insn_gen_nop();
159
160 if (enable)
161 return ftrace_modify_code(pc, nop, branch, true);
162 else
163 return ftrace_modify_code(pc, branch, nop, true);
164}
165
166int ftrace_enable_ftrace_graph_caller(void)
167{
168 return ftrace_modify_graph_caller(true);
169}
170
171int ftrace_disable_ftrace_graph_caller(void)
172{
173 return ftrace_modify_graph_caller(false);
174}
175#endif /* CONFIG_DYNAMIC_FTRACE */
176#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
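
For context, the patching routines above serve the generic ftrace core: a kernel user attaches a callback through the usual API and the core then flips the per-function NOPs into branches via ftrace_make_call(). The sketch below uses only the generic <linux/ftrace.h> interface and is not part of this patch; the callback body is purely illustrative.

	#include <linux/ftrace.h>
	#include <linux/init.h>
	#include <linux/kernel.h>

	/* Called on entry to every traced function once registered. */
	static void my_trace_callback(unsigned long ip, unsigned long parent_ip,
				      struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* Keep this trivial: it runs very frequently. */
		trace_printk("%ps called from %ps\n", (void *)ip, (void *)parent_ip);
	}

	static struct ftrace_ops my_trace_ops = {
		.func	= my_trace_callback,
	};

	static int __init my_tracer_init(void)
	{
		return register_ftrace_function(&my_trace_ops);
	}
	device_initcall(my_tracer_init);
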
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 738291b5be29..a96d3a6a63f6 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -342,11 +342,9 @@ ENTRY(set_cpu_boot_mode_flag)
342 cmp w20, #BOOT_CPU_MODE_EL2 342 cmp w20, #BOOT_CPU_MODE_EL2
343 b.ne 1f 343 b.ne 1f
344 add x1, x1, #4 344 add x1, x1, #4
3451: dc cvac, x1 // Clean potentially dirty cache line 3451: str w20, [x1] // This CPU has booted in EL1
346 dsb sy 346 dmb sy
347 str w20, [x1] // This CPU has booted in EL1 347 dc ivac, x1 // Invalidate potentially stale cache line
348 dc civac, x1 // Clean&invalidate potentially stale cache line
349 dsb sy
350 ret 348 ret
351ENDPROC(set_cpu_boot_mode_flag) 349ENDPROC(set_cpu_boot_mode_flag)
352 350
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index bee789757806..df1cf15377b4 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -20,6 +20,7 @@
20 20
21#define pr_fmt(fmt) "hw-breakpoint: " fmt 21#define pr_fmt(fmt) "hw-breakpoint: " fmt
22 22
23#include <linux/compat.h>
23#include <linux/cpu_pm.h> 24#include <linux/cpu_pm.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/hw_breakpoint.h> 26#include <linux/hw_breakpoint.h>
@@ -27,7 +28,6 @@
27#include <linux/ptrace.h> 28#include <linux/ptrace.h>
28#include <linux/smp.h> 29#include <linux/smp.h>
29 30
30#include <asm/compat.h>
31#include <asm/current.h> 31#include <asm/current.h>
32#include <asm/debug-monitors.h> 32#include <asm/debug-monitors.h>
33#include <asm/hw_breakpoint.h> 33#include <asm/hw_breakpoint.h>
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 6391485f342d..43b7c34f92cb 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -20,6 +20,7 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#include <linux/compat.h>
23#include <linux/export.h> 24#include <linux/export.h>
24#include <linux/sched.h> 25#include <linux/sched.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
@@ -113,32 +114,62 @@ void arch_cpu_idle_dead(void)
113} 114}
114#endif 115#endif
115 116
117/*
118 * Called by kexec, immediately prior to machine_kexec().
119 *
120 * This must completely disable all secondary CPUs; simply causing those CPUs
121 * to execute e.g. a RAM-based pin loop is not sufficient. This allows the
122 * kexec'd kernel to use any and all RAM as it sees fit, without having to
123 * avoid any code or data used by any SW CPU pin loop. The CPU hotplug
124 * functionality embodied in disable_nonboot_cpus() is used to achieve this.
125 */
116void machine_shutdown(void) 126void machine_shutdown(void)
117{ 127{
118#ifdef CONFIG_SMP 128 disable_nonboot_cpus();
119 smp_send_stop();
120#endif
121} 129}
122 130
131/*
132 * Halting simply requires that the secondary CPUs stop performing any
133 * activity (executing tasks, handling interrupts). smp_send_stop()
134 * achieves this.
135 */
123void machine_halt(void) 136void machine_halt(void)
124{ 137{
125 machine_shutdown(); 138 local_irq_disable();
139 smp_send_stop();
126 while (1); 140 while (1);
127} 141}
128 142
143/*
144 * Power-off simply requires that the secondary CPUs stop performing any
145 * activity (executing tasks, handling interrupts). smp_send_stop()
146 * achieves this. When the system power is turned off, it will take all CPUs
147 * with it.
148 */
129void machine_power_off(void) 149void machine_power_off(void)
130{ 150{
131 machine_shutdown(); 151 local_irq_disable();
152 smp_send_stop();
132 if (pm_power_off) 153 if (pm_power_off)
133 pm_power_off(); 154 pm_power_off();
134} 155}
135 156
157/*
158 * Restart requires that the secondary CPUs stop performing any activity
159 * while the primary CPU resets the system. Systems with a single CPU can
160 * use soft_restart() as their machine descriptor's .restart hook, since that
161 * will cause the only available CPU to reset. Systems with multiple CPUs must
162 * provide a HW restart implementation, to ensure that all CPUs reset at once.
163 * This is required so that any code running after reset on the primary CPU
164 * doesn't have to co-ordinate with other CPUs to ensure they aren't still
165 * executing pre-reset code, and using RAM that the primary CPU's code wishes
166 * to use. Implementing such co-ordination would be essentially impossible.
167 */
136void machine_restart(char *cmd) 168void machine_restart(char *cmd)
137{ 169{
138 machine_shutdown();
139
140 /* Disable interrupts first */ 170 /* Disable interrupts first */
141 local_irq_disable(); 171 local_irq_disable();
172 smp_send_stop();
142 173
143 /* Now call the architecture specific reboot code. */ 174 /* Now call the architecture specific reboot code. */
144 if (arm_pm_restart) 175 if (arm_pm_restart)
@@ -205,7 +236,7 @@ void release_thread(struct task_struct *dead_task)
205 236
206int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 237int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
207{ 238{
208 fpsimd_save_state(&current->thread.fpsimd_state); 239 fpsimd_preserve_current_state();
209 *dst = *src; 240 *dst = *src;
210 return 0; 241 return 0;
211} 242}
@@ -300,7 +331,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
300 * Complete any pending TLB or cache maintenance on this CPU in case 331 * Complete any pending TLB or cache maintenance on this CPU in case
301 * the thread migrates to a different CPU. 332 * the thread migrates to a different CPU.
302 */ 333 */
303 dsb(); 334 dsb(ish);
304 335
305 /* the actual thread switch */ 336 /* the actual thread switch */
306 last = cpu_switch_to(prev, next); 337 last = cpu_switch_to(prev, next);
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 6a8928bba03c..3e926b9c0641 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -19,6 +19,7 @@
19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 */ 20 */
21 21
22#include <linux/compat.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/sched.h> 24#include <linux/sched.h>
24#include <linux/mm.h> 25#include <linux/mm.h>
@@ -41,6 +42,9 @@
41#include <asm/traps.h> 42#include <asm/traps.h>
42#include <asm/system_misc.h> 43#include <asm/system_misc.h>
43 44
45#define CREATE_TRACE_POINTS
46#include <trace/events/syscalls.h>
47
44/* 48/*
45 * TODO: does not yet catch signals sent when the child dies. 49 * TODO: does not yet catch signals sent when the child dies.
46 * in exit.c or in signal.c. 50 * in exit.c or in signal.c.
@@ -517,6 +521,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
517 return ret; 521 return ret;
518 522
519 target->thread.fpsimd_state.user_fpsimd = newstate; 523 target->thread.fpsimd_state.user_fpsimd = newstate;
524 fpsimd_flush_task_state(target);
520 return ret; 525 return ret;
521} 526}
522 527
@@ -764,6 +769,7 @@ static int compat_vfp_set(struct task_struct *target,
764 uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; 769 uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
765 } 770 }
766 771
772 fpsimd_flush_task_state(target);
767 return ret; 773 return ret;
768} 774}
769 775
@@ -1058,35 +1064,49 @@ long arch_ptrace(struct task_struct *child, long request,
1058 return ptrace_request(child, request, addr, data); 1064 return ptrace_request(child, request, addr, data);
1059} 1065}
1060 1066
1061asmlinkage int syscall_trace(int dir, struct pt_regs *regs) 1067enum ptrace_syscall_dir {
1068 PTRACE_SYSCALL_ENTER = 0,
1069 PTRACE_SYSCALL_EXIT,
1070};
1071
1072static void tracehook_report_syscall(struct pt_regs *regs,
1073 enum ptrace_syscall_dir dir)
1062{ 1074{
1075 int regno;
1063 unsigned long saved_reg; 1076 unsigned long saved_reg;
1064 1077
1065 if (!test_thread_flag(TIF_SYSCALL_TRACE)) 1078 /*
1066 return regs->syscallno; 1079 * A scratch register (ip(r12) on AArch32, x7 on AArch64) is
1067 1080 * used to denote syscall entry/exit:
1068 if (is_compat_task()) { 1081 */
1069 /* AArch32 uses ip (r12) for scratch */ 1082 regno = (is_compat_task() ? 12 : 7);
1070 saved_reg = regs->regs[12]; 1083 saved_reg = regs->regs[regno];
1071 regs->regs[12] = dir; 1084 regs->regs[regno] = dir;
1072 } else {
1073 /*
1074 * Save X7. X7 is used to denote syscall entry/exit:
1075 * X7 = 0 -> entry, = 1 -> exit
1076 */
1077 saved_reg = regs->regs[7];
1078 regs->regs[7] = dir;
1079 }
1080 1085
1081 if (dir) 1086 if (dir == PTRACE_SYSCALL_EXIT)
1082 tracehook_report_syscall_exit(regs, 0); 1087 tracehook_report_syscall_exit(regs, 0);
1083 else if (tracehook_report_syscall_entry(regs)) 1088 else if (tracehook_report_syscall_entry(regs))
1084 regs->syscallno = ~0UL; 1089 regs->syscallno = ~0UL;
1085 1090
1086 if (is_compat_task()) 1091 regs->regs[regno] = saved_reg;
1087 regs->regs[12] = saved_reg; 1092}
1088 else 1093
1089 regs->regs[7] = saved_reg; 1094asmlinkage int syscall_trace_enter(struct pt_regs *regs)
1095{
1096 if (test_thread_flag(TIF_SYSCALL_TRACE))
1097 tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
1098
1099 if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
1100 trace_sys_enter(regs, regs->syscallno);
1090 1101
1091 return regs->syscallno; 1102 return regs->syscallno;
1092} 1103}
1104
1105asmlinkage void syscall_trace_exit(struct pt_regs *regs)
1106{
1107 if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
1108 trace_sys_exit(regs, regs_return_value(regs));
1109
1110 if (test_thread_flag(TIF_SYSCALL_TRACE))
1111 tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
1112}
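
Seen from userland, the marker register described above means a PTRACE_SYSCALL-based tracer can tell entry stops from exit stops by reading x7 (or r12 for a compat task). A hedged fragment of the tracer side, using the generic regset interface; error handling and the surrounding waitpid()/PTRACE_SYSCALL loop are omitted:

	#include <elf.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/uio.h>
	#include <asm/ptrace.h>		/* struct user_pt_regs */

	/* Call this after waitpid() reports a syscall stop for 'child'. */
	static void show_syscall_stop(pid_t child)
	{
		struct user_pt_regs regs;
		struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

		ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov);
		/* x7 == 0 marks syscall entry, x7 == 1 marks syscall exit. */
		printf("%s: syscall %llu\n",
		       regs.regs[7] ? "exit " : "enter",
		       (unsigned long long)regs.regs[8]);
	}
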
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c
new file mode 100644
index 000000000000..89102a6ffad5
--- /dev/null
+++ b/arch/arm64/kernel/return_address.c
@@ -0,0 +1,55 @@
1/*
2 * arch/arm64/kernel/return_address.c
3 *
4 * Copyright (C) 2013 Linaro Limited
5 * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/export.h>
13#include <linux/ftrace.h>
14
15#include <asm/stacktrace.h>
16
17struct return_address_data {
18 unsigned int level;
19 void *addr;
20};
21
22static int save_return_addr(struct stackframe *frame, void *d)
23{
24 struct return_address_data *data = d;
25
26 if (!data->level) {
27 data->addr = (void *)frame->pc;
28 return 1;
29 } else {
30 --data->level;
31 return 0;
32 }
33}
34
35void *return_address(unsigned int level)
36{
37 struct return_address_data data;
38 struct stackframe frame;
39 register unsigned long current_sp asm ("sp");
40
41 data.level = level + 2;
42 data.addr = NULL;
43
44 frame.fp = (unsigned long)__builtin_frame_address(0);
45 frame.sp = current_sp;
46 frame.pc = (unsigned long)return_address; /* dummy */
47
48 walk_stackframe(&frame, save_return_addr, &data);
49
50 if (!data.level)
51 return data.addr;
52 else
53 return NULL;
54}
55EXPORT_SYMBOL_GPL(return_address);
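
return_address(0) yields the caller of the function that asks, return_address(1) its caller, and so on; the 'level + 2' above accounts for the extra frames introduced by return_address() itself. This routine typically backs ftrace's CALLER_ADDRn helpers (via ftrace_return_address() in asm/ftrace.h), so an in-kernel user would normally write something like the hypothetical diagnostic below rather than calling it directly:

	#include <linux/ftrace.h>
	#include <linux/kernel.h>
	#include <linux/printk.h>

	/* Hypothetical helper: report who called us, and who called them. */
	static noinline void report_callers(void)
	{
		pr_info("called from %pS, which was called from %pS\n",
			(void *)CALLER_ADDR0, (void *)CALLER_ADDR1);
	}
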
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index e578171b22ff..46d1125571f6 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -25,6 +25,7 @@
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/initrd.h> 26#include <linux/initrd.h>
27#include <linux/console.h> 27#include <linux/console.h>
28#include <linux/cache.h>
28#include <linux/bootmem.h> 29#include <linux/bootmem.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/screen_info.h> 31#include <linux/screen_info.h>
@@ -200,6 +201,8 @@ static void __init setup_processor(void)
200{ 201{
201 struct cpu_info *cpu_info; 202 struct cpu_info *cpu_info;
202 u64 features, block; 203 u64 features, block;
204 u32 cwg;
205 int cls;
203 206
204 cpu_info = lookup_processor_type(read_cpuid_id()); 207 cpu_info = lookup_processor_type(read_cpuid_id());
205 if (!cpu_info) { 208 if (!cpu_info) {
@@ -217,6 +220,18 @@ static void __init setup_processor(void)
217 elf_hwcap = 0; 220 elf_hwcap = 0;
218 221
219 /* 222 /*
223 * Check for sane CTR_EL0.CWG value.
224 */
225 cwg = cache_type_cwg();
226 cls = cache_line_size();
227 if (!cwg)
228 pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",
229 cls);
230 if (L1_CACHE_BYTES < cls)
231 pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
232 L1_CACHE_BYTES, cls);
233
234 /*
220 * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. 235 * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks.
221 * The blocks we test below represent incremental functionality 236 * The blocks we test below represent incremental functionality
222 * for non-negative values. Negative values are reserved. 237 * for non-negative values. Negative values are reserved.
@@ -363,7 +378,6 @@ void __init setup_arch(char **cmdline_p)
363 378
364 *cmdline_p = boot_command_line; 379 *cmdline_p = boot_command_line;
365 380
366 init_mem_pgprot();
367 early_ioremap_init(); 381 early_ioremap_init();
368 382
369 parse_early_param(); 383 parse_early_param();
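
On the CWG check added to setup_processor() above: CTR_EL0.CWG (bits [27:24]) encodes log2 of the Cache Writeback Granule in words, so the granule is 4 << CWG bytes. cache_line_size() reports that value, falling back to the build-time L1_CACHE_BYTES when the field reads as zero, and the warning fires when the build-time constant is smaller than what the hardware requires. A hedged sketch of the relationship being checked (not the kernel's actual helpers):

	/* Sketch only: mirrors what cache_line_size() is assumed to compute. */
	static inline unsigned int cwg_to_line_size(unsigned int cwg,
						    unsigned int l1_cache_bytes)
	{
		/* CWG == 0 means "not provided"; fall back to the build-time value. */
		return cwg ? 4U << cwg : l1_cache_bytes;
	}
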
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 890a591f75dd..6357b9c6c90e 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -17,6 +17,7 @@
17 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */ 18 */
19 19
20#include <linux/compat.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/signal.h> 22#include <linux/signal.h>
22#include <linux/personality.h> 23#include <linux/personality.h>
@@ -25,7 +26,6 @@
25#include <linux/tracehook.h> 26#include <linux/tracehook.h>
26#include <linux/ratelimit.h> 27#include <linux/ratelimit.h>
27 28
28#include <asm/compat.h>
29#include <asm/debug-monitors.h> 29#include <asm/debug-monitors.h>
30#include <asm/elf.h> 30#include <asm/elf.h>
31#include <asm/cacheflush.h> 31#include <asm/cacheflush.h>
@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
51 int err; 51 int err;
52 52
53 /* dump the hardware registers to the fpsimd_state structure */ 53 /* dump the hardware registers to the fpsimd_state structure */
54 fpsimd_save_state(fpsimd); 54 fpsimd_preserve_current_state();
55 55
56 /* copy the FP and status/control registers */ 56 /* copy the FP and status/control registers */
57 err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); 57 err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
86 __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); 86 __get_user_error(fpsimd.fpcr, &ctx->fpcr, err);
87 87
88 /* load the hardware registers from the fpsimd_state structure */ 88 /* load the hardware registers from the fpsimd_state structure */
89 if (!err) { 89 if (!err)
90 preempt_disable(); 90 fpsimd_update_current_state(&fpsimd);
91 fpsimd_load_state(&fpsimd);
92 preempt_enable();
93 }
94 91
95 return err ? -EFAULT : 0; 92 return err ? -EFAULT : 0;
96} 93}
@@ -100,8 +97,7 @@ static int restore_sigframe(struct pt_regs *regs,
100{ 97{
101 sigset_t set; 98 sigset_t set;
102 int i, err; 99 int i, err;
103 struct aux_context __user *aux = 100 void *aux = sf->uc.uc_mcontext.__reserved;
104 (struct aux_context __user *)sf->uc.uc_mcontext.__reserved;
105 101
106 err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); 102 err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
107 if (err == 0) 103 if (err == 0)
@@ -121,8 +117,11 @@ static int restore_sigframe(struct pt_regs *regs,
121 117
122 err |= !valid_user_regs(&regs->user_regs); 118 err |= !valid_user_regs(&regs->user_regs);
123 119
124 if (err == 0) 120 if (err == 0) {
125 err |= restore_fpsimd_context(&aux->fpsimd); 121 struct fpsimd_context *fpsimd_ctx =
122 container_of(aux, struct fpsimd_context, head);
123 err |= restore_fpsimd_context(fpsimd_ctx);
124 }
126 125
127 return err; 126 return err;
128} 127}
@@ -167,8 +166,8 @@ static int setup_sigframe(struct rt_sigframe __user *sf,
167 struct pt_regs *regs, sigset_t *set) 166 struct pt_regs *regs, sigset_t *set)
168{ 167{
169 int i, err = 0; 168 int i, err = 0;
170 struct aux_context __user *aux = 169 void *aux = sf->uc.uc_mcontext.__reserved;
171 (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; 170 struct _aarch64_ctx *end;
172 171
173 /* set up the stack frame for unwinding */ 172 /* set up the stack frame for unwinding */
174 __put_user_error(regs->regs[29], &sf->fp, err); 173 __put_user_error(regs->regs[29], &sf->fp, err);
@@ -185,12 +184,27 @@ static int setup_sigframe(struct rt_sigframe __user *sf,
185 184
186 err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); 185 err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
187 186
188 if (err == 0) 187 if (err == 0) {
189 err |= preserve_fpsimd_context(&aux->fpsimd); 188 struct fpsimd_context *fpsimd_ctx =
189 container_of(aux, struct fpsimd_context, head);
190 err |= preserve_fpsimd_context(fpsimd_ctx);
191 aux += sizeof(*fpsimd_ctx);
192 }
193
194 /* fault information, if valid */
195 if (current->thread.fault_code) {
196 struct esr_context *esr_ctx =
197 container_of(aux, struct esr_context, head);
198 __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
199 __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
200 __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
201 aux += sizeof(*esr_ctx);
202 }
190 203
191 /* set the "end" magic */ 204 /* set the "end" magic */
192 __put_user_error(0, &aux->end.magic, err); 205 end = aux;
193 __put_user_error(0, &aux->end.size, err); 206 __put_user_error(0, &end->magic, err);
207 __put_user_error(0, &end->size, err);
194 208
195 return err; 209 return err;
196} 210}
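
With the ESR record added above, the __reserved area of the signal frame carries a sequence of variable-size records, each headed by a struct _aarch64_ctx (magic + size) and terminated by a record with magic 0. A hedged userland sketch of walking the area from an SA_SIGINFO handler; it relies on the uapi <asm/sigcontext.h> definitions, and robustness checks are omitted:

	#include <asm/sigcontext.h>
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <ucontext.h>

	static void segv_handler(int sig, siginfo_t *info, void *ucontext)
	{
		ucontext_t *uc = ucontext;
		unsigned char *p = (unsigned char *)uc->uc_mcontext.__reserved;
		struct _aarch64_ctx head;

		for (;;) {
			memcpy(&head, p, sizeof(head));
			if (!head.magic)
				break;			/* terminating record */
			if (head.magic == ESR_MAGIC) {
				struct esr_context esr;

				memcpy(&esr, p, sizeof(esr));
				/* printf() is not async-signal-safe; demo only. */
				printf("fault ESR: 0x%llx\n",
				       (unsigned long long)esr.esr);
			}
			p += head.size;
		}
	}

The handler would be installed with sigaction() and SA_SIGINFO so that the third argument is a ucontext_t.
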
@@ -416,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
416 clear_thread_flag(TIF_NOTIFY_RESUME); 430 clear_thread_flag(TIF_NOTIFY_RESUME);
417 tracehook_notify_resume(regs); 431 tracehook_notify_resume(regs);
418 } 432 }
433
434 if (thread_flags & _TIF_FOREIGN_FPSTATE)
435 fpsimd_restore_current_state();
436
419} 437}
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index b3fc9f5ec6d3..3491c638f172 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -23,6 +23,7 @@
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ratelimit.h> 24#include <linux/ratelimit.h>
25 25
26#include <asm/esr.h>
26#include <asm/fpsimd.h> 27#include <asm/fpsimd.h>
27#include <asm/signal32.h> 28#include <asm/signal32.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
@@ -81,6 +82,8 @@ struct compat_vfp_sigframe {
81#define VFP_MAGIC 0x56465001 82#define VFP_MAGIC 0x56465001
82#define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe) 83#define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe)
83 84
85#define FSR_WRITE_SHIFT (11)
86
84struct compat_aux_sigframe { 87struct compat_aux_sigframe {
85 struct compat_vfp_sigframe vfp; 88 struct compat_vfp_sigframe vfp;
86 89
@@ -219,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
219 * Note that this also saves V16-31, which aren't visible 222 * Note that this also saves V16-31, which aren't visible
220 * in AArch32. 223 * in AArch32.
221 */ 224 */
222 fpsimd_save_state(fpsimd); 225 fpsimd_preserve_current_state();
223 226
224 /* Place structure header on the stack */ 227 /* Place structure header on the stack */
225 __put_user_error(magic, &frame->magic, err); 228 __put_user_error(magic, &frame->magic, err);
@@ -282,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
282 * We don't need to touch the exception register, so 285 * We don't need to touch the exception register, so
283 * reload the hardware state. 286 * reload the hardware state.
284 */ 287 */
285 if (!err) { 288 if (!err)
286 preempt_disable(); 289 fpsimd_update_current_state(&fpsimd);
287 fpsimd_load_state(&fpsimd);
288 preempt_enable();
289 }
290 290
291 return err ? -EFAULT : 0; 291 return err ? -EFAULT : 0;
292} 292}
@@ -500,7 +500,9 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf,
500 __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); 500 __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err);
501 501
502 __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); 502 __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err);
503 __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.error_code, err); 503 /* set the compat FSR WnR */
504 __put_user_error(!!(current->thread.fault_code & ESR_EL1_WRITE) <<
505 FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err);
504 __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); 506 __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
505 __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); 507 __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err);
506 508
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index c3cb160edc69..40f38f46c8e0 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -35,6 +35,7 @@
35#include <linux/clockchips.h> 35#include <linux/clockchips.h>
36#include <linux/completion.h> 36#include <linux/completion.h>
37#include <linux/of.h> 37#include <linux/of.h>
38#include <linux/irq_work.h>
38 39
39#include <asm/atomic.h> 40#include <asm/atomic.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
@@ -62,6 +63,7 @@ enum ipi_msg_type {
62 IPI_CALL_FUNC_SINGLE, 63 IPI_CALL_FUNC_SINGLE,
63 IPI_CPU_STOP, 64 IPI_CPU_STOP,
64 IPI_TIMER, 65 IPI_TIMER,
66 IPI_IRQ_WORK,
65}; 67};
66 68
67/* 69/*
@@ -477,6 +479,14 @@ void arch_send_call_function_single_ipi(int cpu)
477 smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); 479 smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
478} 480}
479 481
482#ifdef CONFIG_IRQ_WORK
483void arch_irq_work_raise(void)
484{
485 if (smp_cross_call)
486 smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
487}
488#endif
489
480static const char *ipi_types[NR_IPI] = { 490static const char *ipi_types[NR_IPI] = {
481#define S(x,s) [x - IPI_RESCHEDULE] = s 491#define S(x,s) [x - IPI_RESCHEDULE] = s
482 S(IPI_RESCHEDULE, "Rescheduling interrupts"), 492 S(IPI_RESCHEDULE, "Rescheduling interrupts"),
@@ -484,6 +494,7 @@ static const char *ipi_types[NR_IPI] = {
484 S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), 494 S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
485 S(IPI_CPU_STOP, "CPU stop interrupts"), 495 S(IPI_CPU_STOP, "CPU stop interrupts"),
486 S(IPI_TIMER, "Timer broadcast interrupts"), 496 S(IPI_TIMER, "Timer broadcast interrupts"),
497 S(IPI_IRQ_WORK, "IRQ work interrupts"),
487}; 498};
488 499
489void show_ipi_list(struct seq_file *p, int prec) 500void show_ipi_list(struct seq_file *p, int prec)
@@ -576,6 +587,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
576 break; 587 break;
577#endif 588#endif
578 589
590#ifdef CONFIG_IRQ_WORK
591 case IPI_IRQ_WORK:
592 irq_enter();
593 irq_work_run();
594 irq_exit();
595 break;
596#endif
597
579 default: 598 default:
580 pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); 599 pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
581 break; 600 break;
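
The IPI_IRQ_WORK plumbing above hooks arm64 into the generic irq_work machinery: work queued from contexts that cannot take normal locks is run in hard interrupt context shortly afterwards, via a self-IPI. Usage is the same as on other architectures; a brief sketch (names are illustrative):

	#include <linux/irq_work.h>
	#include <linux/printk.h>
	#include <linux/smp.h>

	static void my_deferred_func(struct irq_work *work)
	{
		/* Runs in hard interrupt context shortly after being queued. */
		pr_info("irq_work ran on CPU %d\n", smp_processor_id());
	}

	static struct irq_work my_work;

	static void example_setup(void)
	{
		init_irq_work(&my_work, my_deferred_func);
	}

	/* From a context that cannot sleep or take normal locks: */
	static void example_kick(void)
	{
		irq_work_queue(&my_work);	/* ends up raising IPI_IRQ_WORK */
	}
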
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index 7a530d2cc807..0347d38eea29 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -30,7 +30,6 @@ extern void secondary_holding_pen(void);
30volatile unsigned long secondary_holding_pen_release = INVALID_HWID; 30volatile unsigned long secondary_holding_pen_release = INVALID_HWID;
31 31
32static phys_addr_t cpu_release_addr[NR_CPUS]; 32static phys_addr_t cpu_release_addr[NR_CPUS];
33static DEFINE_RAW_SPINLOCK(boot_lock);
34 33
35/* 34/*
36 * Write secondary_holding_pen_release in a way that is guaranteed to be 35 * Write secondary_holding_pen_release in a way that is guaranteed to be
@@ -94,14 +93,6 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu)
94 93
95static int smp_spin_table_cpu_boot(unsigned int cpu) 94static int smp_spin_table_cpu_boot(unsigned int cpu)
96{ 95{
97 unsigned long timeout;
98
99 /*
100 * Set synchronisation state between this boot processor
101 * and the secondary one
102 */
103 raw_spin_lock(&boot_lock);
104
105 /* 96 /*
106 * Update the pen release flag. 97 * Update the pen release flag.
107 */ 98 */
@@ -112,34 +103,7 @@ static int smp_spin_table_cpu_boot(unsigned int cpu)
112 */ 103 */
113 sev(); 104 sev();
114 105
115 timeout = jiffies + (1 * HZ); 106 return 0;
116 while (time_before(jiffies, timeout)) {
117 if (secondary_holding_pen_release == INVALID_HWID)
118 break;
119 udelay(10);
120 }
121
122 /*
123 * Now the secondary core is starting up let it run its
124 * calibrations, then wait for it to finish
125 */
126 raw_spin_unlock(&boot_lock);
127
128 return secondary_holding_pen_release != INVALID_HWID ? -ENOSYS : 0;
129}
130
131static void smp_spin_table_cpu_postboot(void)
132{
133 /*
134 * Let the primary processor know we're out of the pen.
135 */
136 write_pen_release(INVALID_HWID);
137
138 /*
139 * Synchronise with the boot thread.
140 */
141 raw_spin_lock(&boot_lock);
142 raw_spin_unlock(&boot_lock);
143} 107}
144 108
145const struct cpu_operations smp_spin_table_ops = { 109const struct cpu_operations smp_spin_table_ops = {
@@ -147,5 +111,4 @@ const struct cpu_operations smp_spin_table_ops = {
147 .cpu_init = smp_spin_table_cpu_init, 111 .cpu_init = smp_spin_table_cpu_init,
148 .cpu_prepare = smp_spin_table_cpu_prepare, 112 .cpu_prepare = smp_spin_table_cpu_prepare,
149 .cpu_boot = smp_spin_table_cpu_boot, 113 .cpu_boot = smp_spin_table_cpu_boot,
150 .cpu_postboot = smp_spin_table_cpu_postboot,
151}; 114};
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 38f0558f0c0a..55437ba1f5a4 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -35,7 +35,7 @@
35 * ldp x29, x30, [sp] 35 * ldp x29, x30, [sp]
36 * add sp, sp, #0x10 36 * add sp, sp, #0x10
37 */ 37 */
38int unwind_frame(struct stackframe *frame) 38int notrace unwind_frame(struct stackframe *frame)
39{ 39{
40 unsigned long high, low; 40 unsigned long high, low;
41 unsigned long fp = frame->fp; 41 unsigned long fp = frame->fp;
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index 6815987b50f8..1a7125c3099b 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -18,6 +18,7 @@
18 * along with this program. If not, see <http://www.gnu.org/licenses/>. 18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */ 19 */
20 20
21#include <linux/clockchips.h>
21#include <linux/export.h> 22#include <linux/export.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/interrupt.h> 24#include <linux/interrupt.h>
@@ -69,6 +70,8 @@ void __init time_init(void)
69 of_clk_init(NULL); 70 of_clk_init(NULL);
70 clocksource_of_init(); 71 clocksource_of_init();
71 72
73 tick_setup_hrtimer_broadcast();
74
72 arch_timer_rate = arch_timer_get_rate(); 75 arch_timer_rate = arch_timer_get_rate();
73 if (!arch_timer_rate) 76 if (!arch_timer_rate)
74 panic("Unable to initialise architected timer.\n"); 77 panic("Unable to initialise architected timer.\n");
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 3e06b0be4ec8..43514f905916 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -17,10 +17,192 @@
17#include <linux/percpu.h> 17#include <linux/percpu.h>
18#include <linux/node.h> 18#include <linux/node.h>
19#include <linux/nodemask.h> 19#include <linux/nodemask.h>
20#include <linux/of.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21 22
22#include <asm/topology.h> 23#include <asm/topology.h>
23 24
25static int __init get_cpu_for_node(struct device_node *node)
26{
27 struct device_node *cpu_node;
28 int cpu;
29
30 cpu_node = of_parse_phandle(node, "cpu", 0);
31 if (!cpu_node)
32 return -1;
33
34 for_each_possible_cpu(cpu) {
35 if (of_get_cpu_node(cpu, NULL) == cpu_node) {
36 of_node_put(cpu_node);
37 return cpu;
38 }
39 }
40
41 pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name);
42
43 of_node_put(cpu_node);
44 return -1;
45}
46
47static int __init parse_core(struct device_node *core, int cluster_id,
48 int core_id)
49{
50 char name[10];
51 bool leaf = true;
52 int i = 0;
53 int cpu;
54 struct device_node *t;
55
56 do {
57 snprintf(name, sizeof(name), "thread%d", i);
58 t = of_get_child_by_name(core, name);
59 if (t) {
60 leaf = false;
61 cpu = get_cpu_for_node(t);
62 if (cpu >= 0) {
63 cpu_topology[cpu].cluster_id = cluster_id;
64 cpu_topology[cpu].core_id = core_id;
65 cpu_topology[cpu].thread_id = i;
66 } else {
67 pr_err("%s: Can't get CPU for thread\n",
68 t->full_name);
69 of_node_put(t);
70 return -EINVAL;
71 }
72 of_node_put(t);
73 }
74 i++;
75 } while (t);
76
77 cpu = get_cpu_for_node(core);
78 if (cpu >= 0) {
79 if (!leaf) {
80 pr_err("%s: Core has both threads and CPU\n",
81 core->full_name);
82 return -EINVAL;
83 }
84
85 cpu_topology[cpu].cluster_id = cluster_id;
86 cpu_topology[cpu].core_id = core_id;
87 } else if (leaf) {
88 pr_err("%s: Can't get CPU for leaf core\n", core->full_name);
89 return -EINVAL;
90 }
91
92 return 0;
93}
94
95static int __init parse_cluster(struct device_node *cluster, int depth)
96{
97 char name[10];
98 bool leaf = true;
99 bool has_cores = false;
100 struct device_node *c;
101 static int cluster_id __initdata;
102 int core_id = 0;
103 int i, ret;
104
105 /*
106 * First check for child clusters; we currently ignore any
107 * information about the nesting of clusters and present the
108 * scheduler with a flat list of them.
109 */
110 i = 0;
111 do {
112 snprintf(name, sizeof(name), "cluster%d", i);
113 c = of_get_child_by_name(cluster, name);
114 if (c) {
115 leaf = false;
116 ret = parse_cluster(c, depth + 1);
117 of_node_put(c);
118 if (ret != 0)
119 return ret;
120 }
121 i++;
122 } while (c);
123
124 /* Now check for cores */
125 i = 0;
126 do {
127 snprintf(name, sizeof(name), "core%d", i);
128 c = of_get_child_by_name(cluster, name);
129 if (c) {
130 has_cores = true;
131
132 if (depth == 0) {
133 pr_err("%s: cpu-map children should be clusters\n",
134 c->full_name);
135 of_node_put(c);
136 return -EINVAL;
137 }
138
139 if (leaf) {
140 ret = parse_core(c, cluster_id, core_id++);
141 } else {
142 pr_err("%s: Non-leaf cluster with core %s\n",
143 cluster->full_name, name);
144 ret = -EINVAL;
145 }
146
147 of_node_put(c);
148 if (ret != 0)
149 return ret;
150 }
151 i++;
152 } while (c);
153
154 if (leaf && !has_cores)
155 pr_warn("%s: empty cluster\n", cluster->full_name);
156
157 if (leaf)
158 cluster_id++;
159
160 return 0;
161}
162
163static int __init parse_dt_topology(void)
164{
165 struct device_node *cn, *map;
166 int ret = 0;
167 int cpu;
168
169 cn = of_find_node_by_path("/cpus");
170 if (!cn) {
171 pr_err("No CPU information found in DT\n");
172 return 0;
173 }
174
175 /*
176 * When topology is provided cpu-map is essentially a root
177 * cluster with restricted subnodes.
178 */
179 map = of_get_child_by_name(cn, "cpu-map");
180 if (!map)
181 goto out;
182
183 ret = parse_cluster(map, 0);
184 if (ret != 0)
185 goto out_map;
186
187 /*
188 * Check that all cores are in the topology; the SMP code will
189 * only mark cores described in the DT as possible.
190 */
191 for_each_possible_cpu(cpu) {
192 if (cpu_topology[cpu].cluster_id == -1) {
193 pr_err("CPU%d: No topology information specified\n",
194 cpu);
195 ret = -EINVAL;
196 }
197 }
198
199out_map:
200 of_node_put(map);
201out:
202 of_node_put(cn);
203 return ret;
204}
205
24/* 206/*
25 * cpu topology table 207 * cpu topology table
26 */ 208 */
@@ -39,13 +221,9 @@ static void update_siblings_masks(unsigned int cpuid)
39 221
40 if (cpuid_topo->cluster_id == -1) { 222 if (cpuid_topo->cluster_id == -1) {
41 /* 223 /*
42 * DT does not contain topology information for this cpu 224 * DT does not contain topology information for this cpu.
43 * reset it to default behaviour
44 */ 225 */
45 pr_debug("CPU%u: No topology information configured\n", cpuid); 226 pr_debug("CPU%u: No topology information configured\n", cpuid);
46 cpuid_topo->core_id = 0;
47 cpumask_set_cpu(cpuid, &cpuid_topo->core_sibling);
48 cpumask_set_cpu(cpuid, &cpuid_topo->thread_sibling);
49 return; 227 return;
50 } 228 }
51 229
@@ -74,22 +252,32 @@ void store_cpu_topology(unsigned int cpuid)
74 update_siblings_masks(cpuid); 252 update_siblings_masks(cpuid);
75} 253}
76 254
77/* 255static void __init reset_cpu_topology(void)
78 * init_cpu_topology is called at boot when only one cpu is running
79 * which prevent simultaneous write access to cpu_topology array
80 */
81void __init init_cpu_topology(void)
82{ 256{
83 unsigned int cpu; 257 unsigned int cpu;
84 258
85 /* init core mask and power*/
86 for_each_possible_cpu(cpu) { 259 for_each_possible_cpu(cpu) {
87 struct cpu_topology *cpu_topo = &cpu_topology[cpu]; 260 struct cpu_topology *cpu_topo = &cpu_topology[cpu];
88 261
89 cpu_topo->thread_id = -1; 262 cpu_topo->thread_id = -1;
90 cpu_topo->core_id = -1; 263 cpu_topo->core_id = 0;
91 cpu_topo->cluster_id = -1; 264 cpu_topo->cluster_id = -1;
265
92 cpumask_clear(&cpu_topo->core_sibling); 266 cpumask_clear(&cpu_topo->core_sibling);
267 cpumask_set_cpu(cpu, &cpu_topo->core_sibling);
93 cpumask_clear(&cpu_topo->thread_sibling); 268 cpumask_clear(&cpu_topo->thread_sibling);
269 cpumask_set_cpu(cpu, &cpu_topo->thread_sibling);
94 } 270 }
95} 271}
272
273void __init init_cpu_topology(void)
274{
275 reset_cpu_topology();
276
277 /*
278 * Discard anything that was parsed if we hit an error so we
279 * don't use partial information.
280 */
281 if (parse_dt_topology())
282 reset_cpu_topology();
283}
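
Once the DT parsing above has populated cpu_topology[] and store_cpu_topology() has filled in the sibling masks, the rest of the kernel reads the result through the standard topology accessors, which asm/topology.h is assumed to map onto this table (with cluster_id acting as the physical package id). A hedged consumer sketch:

	#include <linux/cpumask.h>
	#include <linux/printk.h>
	#include <linux/topology.h>

	static void dump_cpu_siblings(unsigned int cpu)
	{
		unsigned int sibling;

		pr_info("CPU%u: package (cluster) %d, core %d\n", cpu,
			topology_physical_package_id(cpu), topology_core_id(cpu));

		for_each_cpu(sibling, topology_core_cpumask(cpu))
			pr_info("  CPU%u shares the cluster\n", sibling);
	}
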
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 7ffadddb645d..c43cfa9b8304 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -251,10 +251,13 @@ void die(const char *str, struct pt_regs *regs, int err)
251void arm64_notify_die(const char *str, struct pt_regs *regs, 251void arm64_notify_die(const char *str, struct pt_regs *regs,
252 struct siginfo *info, int err) 252 struct siginfo *info, int err)
253{ 253{
254 if (user_mode(regs)) 254 if (user_mode(regs)) {
255 current->thread.fault_address = 0;
256 current->thread.fault_code = err;
255 force_sig_info(info->si_signo, info, current); 257 force_sig_info(info->si_signo, info, current);
256 else 258 } else {
257 die(str, regs, err); 259 die(str, regs, err);
260 }
258} 261}
259 262
260asmlinkage void __exception do_undefinstr(struct pt_regs *regs) 263asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 4ba7a55b49c7..f1e6d5c032e1 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -13,7 +13,7 @@
13#define ARM_EXIT_DISCARD(x) x 13#define ARM_EXIT_DISCARD(x) x
14 14
15OUTPUT_ARCH(aarch64) 15OUTPUT_ARCH(aarch64)
16ENTRY(stext) 16ENTRY(_text)
17 17
18jiffies = jiffies_64; 18jiffies = jiffies_64;
19 19
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 2c56012cb2d2..b0d1512acf08 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -630,9 +630,15 @@ ENTRY(__kvm_tlb_flush_vmid_ipa)
630 * whole of Stage-1. Weep... 630 * whole of Stage-1. Weep...
631 */ 631 */
632 tlbi ipas2e1is, x1 632 tlbi ipas2e1is, x1
633 dsb sy 633 /*
634 * We have to ensure completion of the invalidation at Stage-2,
635 * since a table walk on another CPU could refill a TLB with a
636 * complete (S1 + S2) walk based on the old Stage-2 mapping if
637 * the Stage-1 invalidation happened first.
638 */
639 dsb ish
634 tlbi vmalle1is 640 tlbi vmalle1is
635 dsb sy 641 dsb ish
636 isb 642 isb
637 643
638 msr vttbr_el2, xzr 644 msr vttbr_el2, xzr
@@ -643,7 +649,7 @@ ENTRY(__kvm_flush_vm_context)
643 dsb ishst 649 dsb ishst
644 tlbi alle1is 650 tlbi alle1is
645 ic ialluis 651 ic ialluis
646 dsb sy 652 dsb ish
647 ret 653 ret
648ENDPROC(__kvm_flush_vm_context) 654ENDPROC(__kvm_flush_vm_context)
649 655
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 03244582bc55..c59a1bdab5eb 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -71,13 +71,13 @@ static u32 get_ccsidr(u32 csselr)
71static void do_dc_cisw(u32 val) 71static void do_dc_cisw(u32 val)
72{ 72{
73 asm volatile("dc cisw, %x0" : : "r" (val)); 73 asm volatile("dc cisw, %x0" : : "r" (val));
74 dsb(); 74 dsb(ish);
75} 75}
76 76
77static void do_dc_csw(u32 val) 77static void do_dc_csw(u32 val)
78{ 78{
79 asm volatile("dc csw, %x0" : : "r" (val)); 79 asm volatile("dc csw, %x0" : : "r" (val));
80 dsb(); 80 dsb(ish);
81} 81}
82 82
83/* See note at ARM ARM B1.14.4 */ 83/* See note at ARM ARM B1.14.4 */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..d98d3e39879e 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,5 @@
1lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ 1lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
2 copy_to_user.o copy_in_user.o copy_page.o \ 2 copy_to_user.o copy_in_user.o copy_page.o \
3 clear_page.o memchr.o memcpy.o memmove.o memset.o \ 3 clear_page.o memchr.o memcpy.o memmove.o memset.o \
4 memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
4 strchr.o strrchr.o 5 strchr.o strrchr.o
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
new file mode 100644
index 000000000000..6ea0776ba6de
--- /dev/null
+++ b/arch/arm64/lib/memcmp.S
@@ -0,0 +1,258 @@
1/*
2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25#include <linux/linkage.h>
26#include <asm/assembler.h>
27
28/*
29* compare memory areas (when the two memory areas' offsets differ,
30* alignment is handled by the hardware)
31*
32* Parameters:
33* x0 - const memory area 1 pointer
34* x1 - const memory area 2 pointer
35* x2 - the maximal compare byte length
36* Returns:
37* x0 - a compare result, which may be less than, equal to, or greater than ZERO
38*/
39
40/* Parameters and result. */
41src1 .req x0
42src2 .req x1
43limit .req x2
44result .req x0
45
46/* Internal variables. */
47data1 .req x3
48data1w .req w3
49data2 .req x4
50data2w .req w4
51has_nul .req x5
52diff .req x6
53endloop .req x7
54tmp1 .req x8
55tmp2 .req x9
56tmp3 .req x10
57pos .req x11
58limit_wd .req x12
59mask .req x13
60
61ENTRY(memcmp)
62 cbz limit, .Lret0
63 eor tmp1, src1, src2
64 tst tmp1, #7
65 b.ne .Lmisaligned8
66 ands tmp1, src1, #7
67 b.ne .Lmutual_align
68 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
69 lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
70 /*
71 * The input source addresses are at an alignment boundary.
72 * Directly compare eight bytes each time.
73 */
74.Lloop_aligned:
75 ldr data1, [src1], #8
76 ldr data2, [src2], #8
77.Lstart_realigned:
78 subs limit_wd, limit_wd, #1
79 eor diff, data1, data2 /* Non-zero if differences found. */
80 csinv endloop, diff, xzr, cs /* Last Dword or differences. */
81 cbz endloop, .Lloop_aligned
82
83 /* Not reached the limit, must have found a diff. */
84 tbz limit_wd, #63, .Lnot_limit
85
86 /* Limit % 8 == 0 => the diff is in the last 8 bytes. */
87 ands limit, limit, #7
88 b.eq .Lnot_limit
89 /*
90 * Fewer than 8 bytes remain. Extract the valid data from the last
91 * eight bytes of the intended memory range.
92 */
93 lsl limit, limit, #3 /* bytes-> bits. */
94 mov mask, #~0
95CPU_BE( lsr mask, mask, limit )
96CPU_LE( lsl mask, mask, limit )
97 bic data1, data1, mask
98 bic data2, data2, mask
99
100 orr diff, diff, mask
101 b .Lnot_limit
102
103.Lmutual_align:
104 /*
105 * Sources are mutually aligned, but are not currently at an
106 * alignment boundary. Round down the addresses and then mask off
107 * the bytes that precede the start point.
108 */
109 bic src1, src1, #7
110 bic src2, src2, #7
111 ldr data1, [src1], #8
112 ldr data2, [src2], #8
113 /*
114 * We cannot add the alignment offset (tmp1) to limit here, since the
115 * addition could overflow the limit.
116 */
117 sub limit_wd, limit, #1/*limit != 0, so no underflow.*/
118 and tmp3, limit_wd, #7
119 lsr limit_wd, limit_wd, #3
120 add tmp3, tmp3, tmp1
121 add limit_wd, limit_wd, tmp3, lsr #3
122 add limit, limit, tmp1/* Adjust the limit for the extra. */
123
124 lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
125 neg tmp1, tmp1/* Bits to alignment -64. */
126 mov tmp2, #~0
127 /*mask off the non-intended bytes before the start address.*/
128CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
129 /* Little-endian. Early bytes are at LSB. */
130CPU_LE( lsr tmp2, tmp2, tmp1 )
131
132 orr data1, data1, tmp2
133 orr data2, data2, tmp2
134 b .Lstart_realigned
135
136 /*src1 and src2 have different alignment offset.*/
137.Lmisaligned8:
138 cmp limit, #8
139 b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/
140
141 and tmp1, src1, #7
142 neg tmp1, tmp1
143 add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
144 and tmp2, src2, #7
145 neg tmp2, tmp2
146 add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
147 subs tmp3, tmp1, tmp2
148 csel pos, tmp1, tmp2, hi /*Choose the maximum.*/
149
150 sub limit, limit, pos
151 /*compare the leading bytes in the first 8-byte segment.*/
152.Ltinycmp:
153 ldrb data1w, [src1], #1
154 ldrb data2w, [src2], #1
155 subs pos, pos, #1
156 ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
157 b.eq .Ltinycmp
158 cbnz pos, 1f /*diff occurred before the last byte.*/
159 cmp data1w, data2w
160 b.eq .Lstart_align
1611:
162 sub result, data1, data2
163 ret
164
165.Lstart_align:
166 lsr limit_wd, limit, #3
167 cbz limit_wd, .Lremain8
168
169 ands xzr, src1, #7
170 b.eq .Lrecal_offset
171 /*process more leading bytes to make src1 aligned...*/
172 add src1, src1, tmp3 /*backwards src1 to alignment boundary*/
173 add src2, src2, tmp3
174 sub limit, limit, tmp3
175 lsr limit_wd, limit, #3
176 cbz limit_wd, .Lremain8
177 /*load 8 bytes from aligned SRC1..*/
178 ldr data1, [src1], #8
179 ldr data2, [src2], #8
180
181 subs limit_wd, limit_wd, #1
182 eor diff, data1, data2 /*Non-zero if differences found.*/
183 csinv endloop, diff, xzr, ne
184 cbnz endloop, .Lunequal_proc
185 /*How far is the current SRC2 from the alignment boundary...*/
186 and tmp3, tmp3, #7
187
188.Lrecal_offset:/*src1 is aligned now..*/
189 neg pos, tmp3
190.Lloopcmp_proc:
191 /*
192 * Divide the eight bytes into two parts. First, move src2 back to an
193 * alignment boundary, load eight bytes and compare from the SRC2
194 * alignment boundary. If all 8 bytes are equal, then start the second
195 * part's comparison. Otherwise finish the comparison.
196 * This special handling guarantees that all accesses stay within the
197 * thread/task address space, avoiding any out-of-range access.
198 */
199 ldr data1, [src1,pos]
200 ldr data2, [src2,pos]
201 eor diff, data1, data2 /* Non-zero if differences found. */
202 cbnz diff, .Lnot_limit
203
204 /*The second part process*/
205 ldr data1, [src1], #8
206 ldr data2, [src2], #8
207 eor diff, data1, data2 /* Non-zero if differences found. */
208 subs limit_wd, limit_wd, #1
 209 csinv endloop, diff, xzr, ne /* if limit_wd is 0, the comparison will finish */
210 cbz endloop, .Lloopcmp_proc
211.Lunequal_proc:
212 cbz diff, .Lremain8
213
214/* A difference occurred in the latest comparison. */
215.Lnot_limit:
216/*
217* For little-endian, byte-reverse the data so the low-order (equal) bits
218* move to the MSB end; the following CLZ then finds how many equal bits exist.
219*/
220CPU_LE( rev diff, diff )
221CPU_LE( rev data1, data1 )
222CPU_LE( rev data2, data2 )
223
224 /*
225 * The MS-non-zero bit of DIFF marks either the first bit
226 * that is different, or the end of the significant data.
227 * Shifting left now will bring the critical information into the
228 * top bits.
229 */
230 clz pos, diff
231 lsl data1, data1, pos
232 lsl data2, data2, pos
233 /*
234 * We need to zero-extend (char is unsigned) the value and then
235 * perform a signed subtraction.
236 */
237 lsr data1, data1, #56
238 sub result, data1, data2, lsr #56
239 ret
240
241.Lremain8:
 242 /* Limit % 8 == 0 => all data are equal. */
243 ands limit, limit, #7
244 b.eq .Lret0
245
246.Ltiny8proc:
247 ldrb data1w, [src1], #1
248 ldrb data2w, [src2], #1
249 subs limit, limit, #1
250
251 ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
252 b.eq .Ltiny8proc
253 sub result, data1, data2
254 ret
255.Lret0:
256 mov result, #0
257 ret
258ENDPROC(memcmp)
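
The tail handling above (limit % 8 != 0) masks off the bytes of the final dword that lie beyond the limit before the two words are compared. A minimal C sketch of that idea, assuming little-endian byte order and an illustrative helper name (this is not the kernel code):

    #include <stdint.h>

    /* Compare the last partial dword: only 'rem' (1..7) of its bytes are
     * inside the buffer, so clear the bytes beyond the limit first. */
    static int compare_tail(uint64_t data1, uint64_t data2, unsigned int rem)
    {
            uint64_t mask = ~0ULL << (rem * 8);     /* bytes beyond the limit (LE) */

            data1 &= ~mask;
            data2 &= ~mask;
            return data1 != data2;                  /* ordering is derived later via CLZ */
    }
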
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 27b5003609b6..8a9a96d3ddae 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,5 +1,13 @@
1/* 1/*
2 * Copyright (C) 2013 ARM Ltd. 2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
3 * 11 *
4 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 13 * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
16 24
17#include <linux/linkage.h> 25#include <linux/linkage.h>
18#include <asm/assembler.h> 26#include <asm/assembler.h>
27#include <asm/cache.h>
19 28
20/* 29/*
21 * Copy a buffer from src to dest (alignment handled by the hardware) 30 * Copy a buffer from src to dest (alignment handled by the hardware)
@@ -27,27 +36,166 @@
27 * Returns: 36 * Returns:
28 * x0 - dest 37 * x0 - dest
29 */ 38 */
39dstin .req x0
40src .req x1
41count .req x2
42tmp1 .req x3
43tmp1w .req w3
44tmp2 .req x4
45tmp2w .req w4
46tmp3 .req x5
47tmp3w .req w5
48dst .req x6
49
50A_l .req x7
51A_h .req x8
52B_l .req x9
53B_h .req x10
54C_l .req x11
55C_h .req x12
56D_l .req x13
57D_h .req x14
58
30ENTRY(memcpy) 59ENTRY(memcpy)
31 mov x4, x0 60 mov dst, dstin
32 subs x2, x2, #8 61 cmp count, #16
33 b.mi 2f 62 /* When the length is less than 16, the accesses may be unaligned. */
341: ldr x3, [x1], #8 63 b.lo .Ltiny15
35 subs x2, x2, #8 64
36 str x3, [x4], #8 65 neg tmp2, src
37 b.pl 1b 66 ands tmp2, tmp2, #15/* Bytes to reach alignment. */
382: adds x2, x2, #4 67 b.eq .LSrcAligned
39 b.mi 3f 68 sub count, count, tmp2
40 ldr w3, [x1], #4 69 /*
41 sub x2, x2, #4 70 * Copy the leading data from src to dst in increasing address
42 str w3, [x4], #4 71 * order. This eliminates the risk of overwriting the source data
433: adds x2, x2, #2 72 * when the distance between src and dst is less than 16. The
44 b.mi 4f 73 * memory accesses here are aligned.
45 ldrh w3, [x1], #2 74 */
46 sub x2, x2, #2 75 tbz tmp2, #0, 1f
47 strh w3, [x4], #2 76 ldrb tmp1w, [src], #1
484: adds x2, x2, #1 77 strb tmp1w, [dst], #1
49 b.mi 5f 781:
50 ldrb w3, [x1] 79 tbz tmp2, #1, 2f
51 strb w3, [x4] 80 ldrh tmp1w, [src], #2
525: ret 81 strh tmp1w, [dst], #2
822:
83 tbz tmp2, #2, 3f
84 ldr tmp1w, [src], #4
85 str tmp1w, [dst], #4
863:
87 tbz tmp2, #3, .LSrcAligned
88 ldr tmp1, [src],#8
89 str tmp1, [dst],#8
90
91.LSrcAligned:
92 cmp count, #64
93 b.ge .Lcpy_over64
94 /*
95 * Deal with small copies quickly by dropping straight into the
96 * exit block.
97 */
98.Ltail63:
99 /*
100 * Copy up to 48 bytes of data. At this point we only need the
101 * bottom 6 bits of count to be accurate.
102 */
103 ands tmp1, count, #0x30
104 b.eq .Ltiny15
105 cmp tmp1w, #0x20
106 b.eq 1f
107 b.lt 2f
108 ldp A_l, A_h, [src], #16
109 stp A_l, A_h, [dst], #16
1101:
111 ldp A_l, A_h, [src], #16
112 stp A_l, A_h, [dst], #16
1132:
114 ldp A_l, A_h, [src], #16
115 stp A_l, A_h, [dst], #16
116.Ltiny15:
117 /*
 118 * Prefer to break one ldp/stp into several loads/stores that access
 119 * memory in increasing address order, rather than loading/storing 16
 120 * bytes from (src-16) to (dst-16) and stepping src back to an aligned
 121 * address, as the original cortex memcpy does. If that scheme were
 122 * kept here, memmove would need to satisfy the precondition that the
 123 * src address is at least 16 bytes above the dst address, otherwise
 124 * some source data would be overwritten when memmove calls memcpy
 125 * directly. To keep memmove simple and decouple memcpy from memmove,
 126 * the original scheme was dropped.
127 */
128 tbz count, #3, 1f
129 ldr tmp1, [src], #8
130 str tmp1, [dst], #8
1311:
132 tbz count, #2, 2f
133 ldr tmp1w, [src], #4
134 str tmp1w, [dst], #4
1352:
136 tbz count, #1, 3f
137 ldrh tmp1w, [src], #2
138 strh tmp1w, [dst], #2
1393:
140 tbz count, #0, .Lexitfunc
141 ldrb tmp1w, [src]
142 strb tmp1w, [dst]
143
144.Lexitfunc:
145 ret
146
147.Lcpy_over64:
148 subs count, count, #128
149 b.ge .Lcpy_body_large
150 /*
151 * Less than 128 bytes to copy, so handle 64 here and then jump
152 * to the tail.
153 */
154 ldp A_l, A_h, [src],#16
155 stp A_l, A_h, [dst],#16
156 ldp B_l, B_h, [src],#16
157 ldp C_l, C_h, [src],#16
158 stp B_l, B_h, [dst],#16
159 stp C_l, C_h, [dst],#16
160 ldp D_l, D_h, [src],#16
161 stp D_l, D_h, [dst],#16
162
163 tst count, #0x3f
164 b.ne .Ltail63
165 ret
166
167 /*
168 * Critical loop. Start at a new cache line boundary. Assuming
169 * 64 bytes per line this ensures the entire loop is in one line.
170 */
171 .p2align L1_CACHE_SHIFT
172.Lcpy_body_large:
173 /* pre-get 64 bytes data. */
174 ldp A_l, A_h, [src],#16
175 ldp B_l, B_h, [src],#16
176 ldp C_l, C_h, [src],#16
177 ldp D_l, D_h, [src],#16
1781:
179 /*
 180 * Interleave the load of the next 64-byte block with the store of
 181 * the previously loaded 64 bytes.
182 */
183 stp A_l, A_h, [dst],#16
184 ldp A_l, A_h, [src],#16
185 stp B_l, B_h, [dst],#16
186 ldp B_l, B_h, [src],#16
187 stp C_l, C_h, [dst],#16
188 ldp C_l, C_h, [src],#16
189 stp D_l, D_h, [dst],#16
190 ldp D_l, D_h, [src],#16
191 subs count, count, #64
192 b.ge 1b
193 stp A_l, A_h, [dst],#16
194 stp B_l, B_h, [dst],#16
195 stp C_l, C_h, [dst],#16
196 stp D_l, D_h, [dst],#16
197
198 tst count, #0x3f
199 b.ne .Ltail63
200 ret
53ENDPROC(memcpy) 201ENDPROC(memcpy)
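
The tbz ladder at the top of the new memcpy peels off 1, 2, 4 and 8 bytes so that src reaches a 16-byte boundary before the bulk loop. A rough C sketch of that prefix handling, with illustrative names and memcpy used for the chunk moves (not the kernel implementation):

    #include <stdint.h>
    #include <string.h>

    /* Copy the unaligned prefix so that src becomes 16-byte aligned. */
    static size_t copy_prefix(unsigned char *dst, const unsigned char *src)
    {
            size_t head = -(uintptr_t)src & 15;     /* bytes to reach alignment */

            if (head & 1) memcpy(dst, src, 1);      /* mirrors tbz tmp2, #0 */
            if (head & 2) memcpy(dst + (head & 1), src + (head & 1), 2);
            if (head & 4) memcpy(dst + (head & 3), src + (head & 3), 4);
            if (head & 8) memcpy(dst + (head & 7), src + (head & 7), 8);
            return head;                            /* caller advances by this much */
    }

The caller then advances src and dst by the returned length, subtracts it from count and falls into the 64-bytes-per-iteration main loop.
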
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
index b79fdfa42d39..57b19ea2dad4 100644
--- a/arch/arm64/lib/memmove.S
+++ b/arch/arm64/lib/memmove.S
@@ -1,5 +1,13 @@
1/* 1/*
2 * Copyright (C) 2013 ARM Ltd. 2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
3 * 11 *
4 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 13 * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
16 24
17#include <linux/linkage.h> 25#include <linux/linkage.h>
18#include <asm/assembler.h> 26#include <asm/assembler.h>
27#include <asm/cache.h>
19 28
20/* 29/*
21 * Move a buffer from src to dest (alignment handled by the hardware). 30
@@ -28,30 +37,161 @@
28 * Returns: 37 * Returns:
29 * x0 - dest 38 * x0 - dest
30 */ 39 */
40dstin .req x0
41src .req x1
42count .req x2
43tmp1 .req x3
44tmp1w .req w3
45tmp2 .req x4
46tmp2w .req w4
47tmp3 .req x5
48tmp3w .req w5
49dst .req x6
50
51A_l .req x7
52A_h .req x8
53B_l .req x9
54B_h .req x10
55C_l .req x11
56C_h .req x12
57D_l .req x13
58D_h .req x14
59
31ENTRY(memmove) 60ENTRY(memmove)
32 cmp x0, x1 61 cmp dstin, src
33 b.ls memcpy 62 b.lo memcpy
34 add x4, x0, x2 63 add tmp1, src, count
35 add x1, x1, x2 64 cmp dstin, tmp1
36 subs x2, x2, #8 65 b.hs memcpy /* No overlap. */
37 b.mi 2f 66
381: ldr x3, [x1, #-8]! 67 add dst, dstin, count
39 subs x2, x2, #8 68 add src, src, count
40 str x3, [x4, #-8]! 69 cmp count, #16
41 b.pl 1b 70 b.lo .Ltail15 /* possibly unaligned accesses */
422: adds x2, x2, #4 71
43 b.mi 3f 72 ands tmp2, src, #15 /* Bytes to reach alignment. */
44 ldr w3, [x1, #-4]! 73 b.eq .LSrcAligned
45 sub x2, x2, #4 74 sub count, count, tmp2
46 str w3, [x4, #-4]! 75 /*
473: adds x2, x2, #2 76 * Process the unaligned prefix first so that src becomes aligned;
48 b.mi 4f 77 * the cost of these extra instructions is acceptable, and it makes
49 ldrh w3, [x1], #2 78 * the following accesses use aligned addresses.
50 sub x2, x2, #2 79 */
51 strh w3, [x4, #-2]! 80 tbz tmp2, #0, 1f
524: adds x2, x2, #1 81 ldrb tmp1w, [src, #-1]!
53 b.mi 5f 82 strb tmp1w, [dst, #-1]!
54 ldrb w3, [x1, #-1] 831:
55 strb w3, [x4, #-1] 84 tbz tmp2, #1, 2f
565: ret 85 ldrh tmp1w, [src, #-2]!
86 strh tmp1w, [dst, #-2]!
872:
88 tbz tmp2, #2, 3f
89 ldr tmp1w, [src, #-4]!
90 str tmp1w, [dst, #-4]!
913:
92 tbz tmp2, #3, .LSrcAligned
93 ldr tmp1, [src, #-8]!
94 str tmp1, [dst, #-8]!
95
96.LSrcAligned:
97 cmp count, #64
98 b.ge .Lcpy_over64
99
100 /*
101 * Deal with small copies quickly by dropping straight into the
102 * exit block.
103 */
104.Ltail63:
105 /*
106 * Copy up to 48 bytes of data. At this point we only need the
107 * bottom 6 bits of count to be accurate.
108 */
109 ands tmp1, count, #0x30
110 b.eq .Ltail15
111 cmp tmp1w, #0x20
112 b.eq 1f
113 b.lt 2f
114 ldp A_l, A_h, [src, #-16]!
115 stp A_l, A_h, [dst, #-16]!
1161:
117 ldp A_l, A_h, [src, #-16]!
118 stp A_l, A_h, [dst, #-16]!
1192:
120 ldp A_l, A_h, [src, #-16]!
121 stp A_l, A_h, [dst, #-16]!
122
123.Ltail15:
124 tbz count, #3, 1f
125 ldr tmp1, [src, #-8]!
126 str tmp1, [dst, #-8]!
1271:
128 tbz count, #2, 2f
129 ldr tmp1w, [src, #-4]!
130 str tmp1w, [dst, #-4]!
1312:
132 tbz count, #1, 3f
133 ldrh tmp1w, [src, #-2]!
134 strh tmp1w, [dst, #-2]!
1353:
136 tbz count, #0, .Lexitfunc
137 ldrb tmp1w, [src, #-1]
138 strb tmp1w, [dst, #-1]
139
140.Lexitfunc:
141 ret
142
143.Lcpy_over64:
144 subs count, count, #128
145 b.ge .Lcpy_body_large
146 /*
147 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
148 * to the tail.
149 */
150 ldp A_l, A_h, [src, #-16]
151 stp A_l, A_h, [dst, #-16]
152 ldp B_l, B_h, [src, #-32]
153 ldp C_l, C_h, [src, #-48]
154 stp B_l, B_h, [dst, #-32]
155 stp C_l, C_h, [dst, #-48]
156 ldp D_l, D_h, [src, #-64]!
157 stp D_l, D_h, [dst, #-64]!
158
159 tst count, #0x3f
160 b.ne .Ltail63
161 ret
162
163 /*
164 * Critical loop. Start at a new cache line boundary. Assuming
165 * 64 bytes per line this ensures the entire loop is in one line.
166 */
167 .p2align L1_CACHE_SHIFT
168.Lcpy_body_large:
169 /* pre-load 64 bytes data. */
170 ldp A_l, A_h, [src, #-16]
171 ldp B_l, B_h, [src, #-32]
172 ldp C_l, C_h, [src, #-48]
173 ldp D_l, D_h, [src, #-64]!
1741:
175 /*
176 * interlace the load of next 64 bytes data block with store of the last
177 * loaded 64 bytes data.
178 */
179 stp A_l, A_h, [dst, #-16]
180 ldp A_l, A_h, [src, #-16]
181 stp B_l, B_h, [dst, #-32]
182 ldp B_l, B_h, [src, #-32]
183 stp C_l, C_h, [dst, #-48]
184 ldp C_l, C_h, [src, #-48]
185 stp D_l, D_h, [dst, #-64]!
186 ldp D_l, D_h, [src, #-64]!
187 subs count, count, #64
188 b.ge 1b
189 stp A_l, A_h, [dst, #-16]
190 stp B_l, B_h, [dst, #-32]
191 stp C_l, C_h, [dst, #-48]
192 stp D_l, D_h, [dst, #-64]!
193
194 tst count, #0x3f
195 b.ne .Ltail63
196 ret
57ENDPROC(memmove) 197ENDPROC(memmove)
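
The new memmove only copies backwards when the regions actually overlap with dst above src; otherwise it hands the work straight to memcpy. A small C sketch of that dispatch (illustrative names only):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void *memmove_sketch(void *dstin, const void *src, size_t count)
    {
            uintptr_t d = (uintptr_t)dstin, s = (uintptr_t)src;

            /* dst below src, or dst at/after the end of src: a forward copy
             * cannot clobber unread source data, so use memcpy directly. */
            if (d < s || d >= s + count)
                    return memcpy(dstin, src, count);

            /* Overlapping with dst above src: copy from the end, downwards. */
            {
                    unsigned char *dp = (unsigned char *)d + count;
                    const unsigned char *sp = (const unsigned char *)s + count;

                    while (count--)
                            *--dp = *--sp;
            }
            return dstin;
    }
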
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 87e4a68fbbbc..7c72dfd36b63 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -1,5 +1,13 @@
1/* 1/*
2 * Copyright (C) 2013 ARM Ltd. 2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
3 * 11 *
4 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 13 * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
16 24
17#include <linux/linkage.h> 25#include <linux/linkage.h>
18#include <asm/assembler.h> 26#include <asm/assembler.h>
27#include <asm/cache.h>
19 28
20/* 29/*
21 * Fill in the buffer with character c (alignment handled by the hardware) 30 * Fill in the buffer with character c (alignment handled by the hardware)
@@ -27,27 +36,181 @@
27 * Returns: 36 * Returns:
28 * x0 - buf 37 * x0 - buf
29 */ 38 */
39
40dstin .req x0
41val .req w1
42count .req x2
43tmp1 .req x3
44tmp1w .req w3
45tmp2 .req x4
46tmp2w .req w4
47zva_len_x .req x5
48zva_len .req w5
49zva_bits_x .req x6
50
51A_l .req x7
52A_lw .req w7
53dst .req x8
54tmp3w .req w9
55tmp3 .req x9
56
30ENTRY(memset) 57ENTRY(memset)
31 mov x4, x0 58 mov dst, dstin /* Preserve return value. */
32 and w1, w1, #0xff 59 and A_lw, val, #255
33 orr w1, w1, w1, lsl #8 60 orr A_lw, A_lw, A_lw, lsl #8
34 orr w1, w1, w1, lsl #16 61 orr A_lw, A_lw, A_lw, lsl #16
35 orr x1, x1, x1, lsl #32 62 orr A_l, A_l, A_l, lsl #32
36 subs x2, x2, #8 63
37 b.mi 2f 64 cmp count, #15
381: str x1, [x4], #8 65 b.hi .Lover16_proc
39 subs x2, x2, #8 66 /* All stores may be unaligned. */
40 b.pl 1b 67 tbz count, #3, 1f
412: adds x2, x2, #4 68 str A_l, [dst], #8
42 b.mi 3f 691:
43 sub x2, x2, #4 70 tbz count, #2, 2f
44 str w1, [x4], #4 71 str A_lw, [dst], #4
453: adds x2, x2, #2 722:
46 b.mi 4f 73 tbz count, #1, 3f
47 sub x2, x2, #2 74 strh A_lw, [dst], #2
48 strh w1, [x4], #2 753:
494: adds x2, x2, #1 76 tbz count, #0, 4f
50 b.mi 5f 77 strb A_lw, [dst]
51 strb w1, [x4] 784:
525: ret 79 ret
80
81.Lover16_proc:
 82 /* Check whether the start address is 16-byte aligned. */
83 neg tmp2, dst
84 ands tmp2, tmp2, #15
85 b.eq .Laligned
86/*
87* The count is at least 16, so we can use stp to store the first 16 bytes,
88* then advance dst to a 16-byte boundary. This leaves the current
89* memory address on an alignment boundary.
90*/
 91 stp A_l, A_l, [dst] /* unaligned store */
 92 /* make dst 16-byte aligned */
93 sub count, count, tmp2
94 add dst, dst, tmp2
95
96.Laligned:
97 cbz A_l, .Lzero_mem
98
99.Ltail_maybe_long:
100 cmp count, #64
101 b.ge .Lnot_short
102.Ltail63:
103 ands tmp1, count, #0x30
104 b.eq 3f
105 cmp tmp1w, #0x20
106 b.eq 1f
107 b.lt 2f
108 stp A_l, A_l, [dst], #16
1091:
110 stp A_l, A_l, [dst], #16
1112:
112 stp A_l, A_l, [dst], #16
113/*
114* Fewer than 16 bytes remain; use stp to write the last 16 bytes.
115* Some bytes are written twice and the access may be unaligned.
116*/
1173:
118 ands count, count, #15
119 cbz count, 4f
120 add dst, dst, count
121 stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
1224:
123 ret
124
125 /*
126 * Critical loop. Start at a new cache line boundary. Assuming
127 * 64 bytes per line, this ensures the entire loop is in one line.
128 */
129 .p2align L1_CACHE_SHIFT
130.Lnot_short:
131 sub dst, dst, #16/* Pre-bias. */
132 sub count, count, #64
1331:
134 stp A_l, A_l, [dst, #16]
135 stp A_l, A_l, [dst, #32]
136 stp A_l, A_l, [dst, #48]
137 stp A_l, A_l, [dst, #64]!
138 subs count, count, #64
139 b.ge 1b
140 tst count, #0x3f
141 add dst, dst, #16
142 b.ne .Ltail63
143.Lexitfunc:
144 ret
145
146 /*
147 * For zeroing memory, check to see if we can use the ZVA feature to
148 * zero entire 'cache' lines.
149 */
150.Lzero_mem:
151 cmp count, #63
152 b.le .Ltail63
153 /*
154 * For zeroing small amounts of memory, it's not worth setting up
155 * the line-clear code.
156 */
157 cmp count, #128
 158 b.lt .Lnot_short /* fall through only if count >= 128 */
159
160 mrs tmp1, dczid_el0
161 tbnz tmp1, #4, .Lnot_short
162 mov tmp3w, #4
163 and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
164 lsl zva_len, tmp3w, zva_len
165
166 ands tmp3w, zva_len, #63
167 /*
 168 * Ensure that zva_len is at least 64; using ZVA is not worthwhile
 169 * if the block size is smaller than 64 bytes.
170 */
171 b.ne .Lnot_short
172.Lzero_by_line:
173 /*
174 * Compute how far we need to go to become suitably aligned. We're
175 * already at quad-word alignment.
176 */
177 cmp count, zva_len_x
178 b.lt .Lnot_short /* Not enough to reach alignment. */
179 sub zva_bits_x, zva_len_x, #1
180 neg tmp2, dst
181 ands tmp2, tmp2, zva_bits_x
182 b.eq 2f /* Already aligned. */
183 /* Not aligned, check that there's enough to copy after alignment.*/
184 sub tmp1, count, tmp2
185 /*
 186 * Guarantee that the remaining length to be zeroed with ZVA exceeds
 187 * 64, so that the processing at 2f cannot overrun the memory range. */
188 cmp tmp1, #64
189 ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
190 b.lt .Lnot_short
191 /*
192 * We know that there's at least 64 bytes to zero and that it's safe
193 * to overrun by 64 bytes.
194 */
195 mov count, tmp1
1961:
197 stp A_l, A_l, [dst]
198 stp A_l, A_l, [dst, #16]
199 stp A_l, A_l, [dst, #32]
200 subs tmp2, tmp2, #64
201 stp A_l, A_l, [dst, #48]
202 add dst, dst, #64
203 b.ge 1b
204 /* We've overrun a bit, so adjust dst downwards.*/
205 add dst, dst, tmp2
2062:
207 sub count, count, zva_len_x
2083:
209 dc zva, dst
210 add dst, dst, zva_len_x
211 subs count, count, zva_len_x
212 b.ge 3b
213 ands count, count, zva_bits_x
214 b.ne .Ltail_maybe_long
215 ret
53ENDPROC(memset) 216ENDPROC(memset)
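
Two details above are worth spelling out: the fill byte is replicated across a 64-bit register before any stp, and the DC ZVA block size is derived from DCZID_EL0 (4 << BS bytes, with bit 4 meaning ZVA is prohibited). A hedged C sketch of both computations; dczid is passed in as a plain value here rather than read with mrs, and the names are illustrative:

    #include <stdint.h>
    #include <stdbool.h>

    /* Replicate the low byte of val into all eight bytes of a dword,
     * mirroring the orr/lsl ladder at the top of memset. */
    static uint64_t splat_byte(unsigned char val)
    {
            uint64_t v = val;

            v |= v << 8;
            v |= v << 16;
            v |= v << 32;
            return v;
    }

    /* Decode DCZID_EL0: block size is 4 << BS bytes; bit 4 (DZP) set
     * means DC ZVA must not be used. */
    static bool zva_usable(uint64_t dczid, unsigned int *block_bytes)
    {
            if (dczid & (1u << 4))
                    return false;
            *block_bytes = 4u << (dczid & 0xf);
            return *block_bytes >= 64;      /* smaller blocks are not worth it */
    }
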
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
new file mode 100644
index 000000000000..42f828b06c59
--- /dev/null
+++ b/arch/arm64/lib/strcmp.S
@@ -0,0 +1,234 @@
1/*
2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25#include <linux/linkage.h>
26#include <asm/assembler.h>
27
28/*
29 * compare two strings
30 *
31 * Parameters:
32 * x0 - const string 1 pointer
33 * x1 - const string 2 pointer
34 * Returns:
35 * x0 - an integer less than, equal to, or greater than zero
36 * if s1 is found, respectively, to be less than, to match,
37 * or be greater than s2.
38 */
39
40#define REP8_01 0x0101010101010101
41#define REP8_7f 0x7f7f7f7f7f7f7f7f
42#define REP8_80 0x8080808080808080
43
44/* Parameters and result. */
45src1 .req x0
46src2 .req x1
47result .req x0
48
49/* Internal variables. */
50data1 .req x2
51data1w .req w2
52data2 .req x3
53data2w .req w3
54has_nul .req x4
55diff .req x5
56syndrome .req x6
57tmp1 .req x7
58tmp2 .req x8
59tmp3 .req x9
60zeroones .req x10
61pos .req x11
62
63ENTRY(strcmp)
64 eor tmp1, src1, src2
65 mov zeroones, #REP8_01
66 tst tmp1, #7
67 b.ne .Lmisaligned8
68 ands tmp1, src1, #7
69 b.ne .Lmutual_align
70
71 /*
72 * NUL detection works on the principle that (X - 1) & (~X) & 0x80
73 * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
74 * can be done in parallel across the entire word.
75 */
76.Lloop_aligned:
77 ldr data1, [src1], #8
78 ldr data2, [src2], #8
79.Lstart_realigned:
80 sub tmp1, data1, zeroones
81 orr tmp2, data1, #REP8_7f
82 eor diff, data1, data2 /* Non-zero if differences found. */
83 bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
84 orr syndrome, diff, has_nul
85 cbz syndrome, .Lloop_aligned
86 b .Lcal_cmpresult
87
88.Lmutual_align:
89 /*
90 * Sources are mutually aligned, but are not currently at an
91 * alignment boundary. Round down the addresses and then mask off
 92 * the bytes that precede the start point.
93 */
94 bic src1, src1, #7
95 bic src2, src2, #7
96 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
97 ldr data1, [src1], #8
98 neg tmp1, tmp1 /* Bits to alignment -64. */
99 ldr data2, [src2], #8
100 mov tmp2, #~0
101 /* Big-endian. Early bytes are at MSB. */
102CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
103 /* Little-endian. Early bytes are at LSB. */
104CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
105
106 orr data1, data1, tmp2
107 orr data2, data2, tmp2
108 b .Lstart_realigned
109
110.Lmisaligned8:
111 /*
 112 * Work out the alignment offsets so the leading bytes can be
 113 * compared one at a time; after this, one string's address is aligned.
114 */
115 and tmp1, src1, #7
116 neg tmp1, tmp1
117 add tmp1, tmp1, #8
118 and tmp2, src2, #7
119 neg tmp2, tmp2
120 add tmp2, tmp2, #8
121 subs tmp3, tmp1, tmp2
122 csel pos, tmp1, tmp2, hi /*Choose the maximum. */
123.Ltinycmp:
124 ldrb data1w, [src1], #1
125 ldrb data2w, [src2], #1
126 subs pos, pos, #1
127 ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
128 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
129 b.eq .Ltinycmp
130 cbnz pos, 1f /*find the null or unequal...*/
131 cmp data1w, #1
132 ccmp data1w, data2w, #0, cs
133 b.eq .Lstart_align /*the last bytes are equal....*/
1341:
135 sub result, data1, data2
136 ret
137
138.Lstart_align:
139 ands xzr, src1, #7
140 b.eq .Lrecal_offset
141 /*process more leading bytes to make str1 aligned...*/
142 add src1, src1, tmp3
143 add src2, src2, tmp3
144 /*load 8 bytes from aligned str1 and non-aligned str2..*/
145 ldr data1, [src1], #8
146 ldr data2, [src2], #8
147
148 sub tmp1, data1, zeroones
149 orr tmp2, data1, #REP8_7f
150 bic has_nul, tmp1, tmp2
151 eor diff, data1, data2 /* Non-zero if differences found. */
152 orr syndrome, diff, has_nul
153 cbnz syndrome, .Lcal_cmpresult
154 /*How far is the current str2 from the alignment boundary...*/
155 and tmp3, tmp3, #7
156.Lrecal_offset:
157 neg pos, tmp3
158.Lloopcmp_proc:
159 /*
 160 * Divide the eight bytes into two parts. First, step src2 back to
 161 * an alignment boundary, load eight bytes from the SRC2 alignment
 162 * boundary, then compare them with the corresponding bytes from SRC1.
 163 * If all 8 bytes are equal, start the second part's comparison.
 164 * Otherwise finish the comparison.
 165 * This special handling guarantees that all accesses stay within the
 166 * thread/task address space and never overrange.
167 */
168 ldr data1, [src1,pos]
169 ldr data2, [src2,pos]
170 sub tmp1, data1, zeroones
171 orr tmp2, data1, #REP8_7f
172 bic has_nul, tmp1, tmp2
173 eor diff, data1, data2 /* Non-zero if differences found. */
174 orr syndrome, diff, has_nul
175 cbnz syndrome, .Lcal_cmpresult
176
177 /*The second part process*/
178 ldr data1, [src1], #8
179 ldr data2, [src2], #8
180 sub tmp1, data1, zeroones
181 orr tmp2, data1, #REP8_7f
182 bic has_nul, tmp1, tmp2
183 eor diff, data1, data2 /* Non-zero if differences found. */
184 orr syndrome, diff, has_nul
185 cbz syndrome, .Lloopcmp_proc
186
187.Lcal_cmpresult:
188 /*
 189 * Byte-swap the data into big-endian order so that CLZ can find the
 190 * most significant zero bits.
191 */
192CPU_LE( rev syndrome, syndrome )
193CPU_LE( rev data1, data1 )
194CPU_LE( rev data2, data2 )
195
196 /*
197 * For big-endian we cannot use the trick with the syndrome value
198 * as carry-propagation can corrupt the upper bits if the trailing
199 * bytes in the string contain 0x01.
200 * However, if there is no NUL byte in the dword, we can generate
 201 * the result directly. We cannot just subtract the bytes as the
202 * MSB might be significant.
203 */
204CPU_BE( cbnz has_nul, 1f )
205CPU_BE( cmp data1, data2 )
206CPU_BE( cset result, ne )
207CPU_BE( cneg result, result, lo )
208CPU_BE( ret )
209CPU_BE( 1: )
210 /*Re-compute the NUL-byte detection, using a byte-reversed value. */
211CPU_BE( rev tmp3, data1 )
212CPU_BE( sub tmp1, tmp3, zeroones )
213CPU_BE( orr tmp2, tmp3, #REP8_7f )
214CPU_BE( bic has_nul, tmp1, tmp2 )
215CPU_BE( rev has_nul, has_nul )
216CPU_BE( orr syndrome, diff, has_nul )
217
218 clz pos, syndrome
219 /*
220 * The MS-non-zero bit of the syndrome marks either the first bit
221 * that is different, or the top bit of the first zero byte.
222 * Shifting left now will bring the critical information into the
223 * top bits.
224 */
225 lsl data1, data1, pos
226 lsl data2, data2, pos
227 /*
228 * But we need to zero-extend (char is unsigned) the value and then
229 * perform a signed 32-bit subtraction.
230 */
231 lsr data1, data1, #56
232 sub result, data1, data2, lsr #56
233 ret
234ENDPROC(strcmp)
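
The NUL-detection expression used throughout these routines, (X - 0x01..01) & ~(X | 0x7f..7f), is non-zero exactly when some byte of X is zero. A standalone C rendering of it (illustrative, not the kernel's helper):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff the 64-bit word x contains a zero byte. */
    static uint64_t has_nul_byte(uint64_t x)
    {
            return (x - REP8_01) & ~(x | REP8_7f);
    }

The borrow in the subtraction can also set bits in bytes after the first NUL, which is why the big-endian paths above byte-swap the data and recompute the syndrome before trusting the bit position.
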
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
new file mode 100644
index 000000000000..987b68b9ce44
--- /dev/null
+++ b/arch/arm64/lib/strlen.S
@@ -0,0 +1,126 @@
1/*
2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25#include <linux/linkage.h>
26#include <asm/assembler.h>
27
28/*
29 * calculate the length of a string
30 *
31 * Parameters:
32 * x0 - const string pointer
33 * Returns:
34 * x0 - the return length of specific string
35 */
36
37/* Arguments and results. */
38srcin .req x0
39len .req x0
40
41/* Locals and temporaries. */
42src .req x1
43data1 .req x2
44data2 .req x3
45data2a .req x4
46has_nul1 .req x5
47has_nul2 .req x6
48tmp1 .req x7
49tmp2 .req x8
50tmp3 .req x9
51tmp4 .req x10
52zeroones .req x11
53pos .req x12
54
55#define REP8_01 0x0101010101010101
56#define REP8_7f 0x7f7f7f7f7f7f7f7f
57#define REP8_80 0x8080808080808080
58
59ENTRY(strlen)
60 mov zeroones, #REP8_01
61 bic src, srcin, #15
62 ands tmp1, srcin, #15
63 b.ne .Lmisaligned
64 /*
65 * NUL detection works on the principle that (X - 1) & (~X) & 0x80
66 * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
67 * can be done in parallel across the entire word.
68 */
69 /*
70 * The inner loop deals with two Dwords at a time. This has a
71 * slightly higher start-up cost, but we should win quite quickly,
72 * especially on cores with a high number of issue slots per
73 * cycle, as we get much better parallelism out of the operations.
74 */
75.Lloop:
76 ldp data1, data2, [src], #16
77.Lrealigned:
78 sub tmp1, data1, zeroones
79 orr tmp2, data1, #REP8_7f
80 sub tmp3, data2, zeroones
81 orr tmp4, data2, #REP8_7f
82 bic has_nul1, tmp1, tmp2
83 bics has_nul2, tmp3, tmp4
84 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
85 b.eq .Lloop
86
87 sub len, src, srcin
88 cbz has_nul1, .Lnul_in_data2
89CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/
90 sub len, len, #8
91 mov has_nul2, has_nul1
92.Lnul_in_data2:
93 /*
94 * For big-endian, carry propagation (if the final byte in the
95 * string is 0x01) means we cannot use has_nul directly. The
96 * easiest way to get the correct byte is to byte-swap the data
97 * and calculate the syndrome a second time.
98 */
99CPU_BE( rev data2, data2 )
100CPU_BE( sub tmp1, data2, zeroones )
101CPU_BE( orr tmp2, data2, #REP8_7f )
102CPU_BE( bic has_nul2, tmp1, tmp2 )
103
104 sub len, len, #8
105 rev has_nul2, has_nul2
106 clz pos, has_nul2
107 add len, len, pos, lsr #3 /* Bits to bytes. */
108 ret
109
110.Lmisaligned:
111 cmp tmp1, #8
112 neg tmp1, tmp1
113 ldp data1, data2, [src], #16
114 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
115 mov tmp2, #~0
116 /* Big-endian. Early bytes are at MSB. */
117CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
118 /* Little-endian. Early bytes are at LSB. */
119CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
120
121 orr data1, data1, tmp2
122 orr data2a, data2, tmp2
123 csinv data1, data1, xzr, le
124 csel data2, data2, data2a, le
125 b .Lrealigned
126ENDPROC(strlen)
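
Putting the same trick into a word-at-a-time strlen gives a compact picture of what the loop above does. This sketch assumes an 8-byte-aligned, little-endian input and reads whole words past the terminator, which is fine for the hand-written assembly but not strictly portable C:

    #include <stddef.h>
    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    static size_t strlen_sketch(const char *s)
    {
            const uint64_t *p = (const uint64_t *)s;    /* assumed 8-byte aligned */
            uint64_t x, nul;

            do {
                    x = *p++;
                    nul = (x - REP8_01) & ~(x | REP8_7f);
            } while (!nul);

            /* On little-endian the lowest set 0x80 bit marks the first NUL byte;
             * the assembly reaches the same answer with rev + clz. */
            return (const char *)(p - 1) - s + (__builtin_ctzll(nul) >> 3);
    }
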
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
new file mode 100644
index 000000000000..0224cf5a5533
--- /dev/null
+++ b/arch/arm64/lib/strncmp.S
@@ -0,0 +1,310 @@
1/*
2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25#include <linux/linkage.h>
26#include <asm/assembler.h>
27
28/*
29 * compare two strings
30 *
31 * Parameters:
32 * x0 - const string 1 pointer
33 * x1 - const string 2 pointer
34 * x2 - the maximal length to be compared
35 * Returns:
36 * x0 - an integer less than, equal to, or greater than zero if s1 is found,
37 * respectively, to be less than, to match, or be greater than s2.
38 */
39
40#define REP8_01 0x0101010101010101
41#define REP8_7f 0x7f7f7f7f7f7f7f7f
42#define REP8_80 0x8080808080808080
43
44/* Parameters and result. */
45src1 .req x0
46src2 .req x1
47limit .req x2
48result .req x0
49
50/* Internal variables. */
51data1 .req x3
52data1w .req w3
53data2 .req x4
54data2w .req w4
55has_nul .req x5
56diff .req x6
57syndrome .req x7
58tmp1 .req x8
59tmp2 .req x9
60tmp3 .req x10
61zeroones .req x11
62pos .req x12
63limit_wd .req x13
64mask .req x14
65endloop .req x15
66
67ENTRY(strncmp)
68 cbz limit, .Lret0
69 eor tmp1, src1, src2
70 mov zeroones, #REP8_01
71 tst tmp1, #7
72 b.ne .Lmisaligned8
73 ands tmp1, src1, #7
74 b.ne .Lmutual_align
75 /* Calculate the number of full and partial words -1. */
76 /*
 77 * When limit is a multiple of 8, subtracting 1 here keeps the
 78 * handling of the last dword correct.
79 */
80 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
81 lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
82
83 /*
84 * NUL detection works on the principle that (X - 1) & (~X) & 0x80
85 * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
86 * can be done in parallel across the entire word.
87 */
88.Lloop_aligned:
89 ldr data1, [src1], #8
90 ldr data2, [src2], #8
91.Lstart_realigned:
92 subs limit_wd, limit_wd, #1
93 sub tmp1, data1, zeroones
94 orr tmp2, data1, #REP8_7f
95 eor diff, data1, data2 /* Non-zero if differences found. */
96 csinv endloop, diff, xzr, pl /* Last Dword or differences.*/
97 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
98 ccmp endloop, #0, #0, eq
99 b.eq .Lloop_aligned
100
101 /*Not reached the limit, must have found the end or a diff. */
102 tbz limit_wd, #63, .Lnot_limit
103
104 /* Limit % 8 == 0 => all bytes significant. */
105 ands limit, limit, #7
106 b.eq .Lnot_limit
107
 108 lsl limit, limit, #3 /* Bytes -> bits. */
109 mov mask, #~0
110CPU_BE( lsr mask, mask, limit )
111CPU_LE( lsl mask, mask, limit )
112 bic data1, data1, mask
113 bic data2, data2, mask
114
115 /* Make sure that the NUL byte is marked in the syndrome. */
116 orr has_nul, has_nul, mask
117
118.Lnot_limit:
119 orr syndrome, diff, has_nul
120 b .Lcal_cmpresult
121
122.Lmutual_align:
123 /*
124 * Sources are mutually aligned, but are not currently at an
125 * alignment boundary. Round down the addresses and then mask off
126 * the bytes that precede the start point.
127 * We also need to adjust the limit calculations, but without
128 * overflowing if the limit is near ULONG_MAX.
129 */
130 bic src1, src1, #7
131 bic src2, src2, #7
132 ldr data1, [src1], #8
133 neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
134 ldr data2, [src2], #8
135 mov tmp2, #~0
136 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
137 /* Big-endian. Early bytes are at MSB. */
138CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
139 /* Little-endian. Early bytes are at LSB. */
140CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
141
142 and tmp3, limit_wd, #7
143 lsr limit_wd, limit_wd, #3
144 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
145 add limit, limit, tmp1
146 add tmp3, tmp3, tmp1
147 orr data1, data1, tmp2
148 orr data2, data2, tmp2
149 add limit_wd, limit_wd, tmp3, lsr #3
150 b .Lstart_realigned
151
152/* when the src1 offset is not equal to the src2 offset */
153.Lmisaligned8:
154 cmp limit, #8
155 b.lo .Ltiny8proc /*limit < 8... */
156 /*
 157 * Work out the alignment offsets so the leading bytes can be
 158 * compared one at a time; after this, one string's address is aligned. */
159 and tmp1, src1, #7
160 neg tmp1, tmp1
161 add tmp1, tmp1, #8
162 and tmp2, src2, #7
163 neg tmp2, tmp2
164 add tmp2, tmp2, #8
165 subs tmp3, tmp1, tmp2
166 csel pos, tmp1, tmp2, hi /*Choose the maximum. */
167 /*
 168 * Here limit is at least 8, so run .Ltinycmp directly
 169 * without checking the limit. */
170 sub limit, limit, pos
171.Ltinycmp:
172 ldrb data1w, [src1], #1
173 ldrb data2w, [src2], #1
174 subs pos, pos, #1
175 ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
176 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
177 b.eq .Ltinycmp
178 cbnz pos, 1f /*find the null or unequal...*/
179 cmp data1w, #1
180 ccmp data1w, data2w, #0, cs
181 b.eq .Lstart_align /*the last bytes are equal....*/
1821:
183 sub result, data1, data2
184 ret
185
186.Lstart_align:
187 lsr limit_wd, limit, #3
188 cbz limit_wd, .Lremain8
189 /*process more leading bytes to make str1 aligned...*/
190 ands xzr, src1, #7
191 b.eq .Lrecal_offset
192 add src1, src1, tmp3 /*tmp3 is positive in this branch.*/
193 add src2, src2, tmp3
194 ldr data1, [src1], #8
195 ldr data2, [src2], #8
196
197 sub limit, limit, tmp3
198 lsr limit_wd, limit, #3
199 subs limit_wd, limit_wd, #1
200
201 sub tmp1, data1, zeroones
202 orr tmp2, data1, #REP8_7f
203 eor diff, data1, data2 /* Non-zero if differences found. */
 204 csinv endloop, diff, xzr, ne /* if limit_wd is 0, the comparison will finish */
205 bics has_nul, tmp1, tmp2
 206 ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte */
207 b.ne .Lunequal_proc
208 /*How far is the current str2 from the alignment boundary...*/
209 and tmp3, tmp3, #7
210.Lrecal_offset:
211 neg pos, tmp3
212.Lloopcmp_proc:
213 /*
 214 * Divide the eight bytes into two parts. First, step src2 back to
 215 * an alignment boundary, load eight bytes from the SRC2 alignment
 216 * boundary, then compare them with the corresponding bytes from SRC1.
 217 * If all 8 bytes are equal, start the second part's comparison.
 218 * Otherwise finish the comparison.
 219 * This special handling guarantees that all accesses stay within the
 220 * thread/task address space and never overrange.
221 */
222 ldr data1, [src1,pos]
223 ldr data2, [src2,pos]
224 sub tmp1, data1, zeroones
225 orr tmp2, data1, #REP8_7f
226 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
227 eor diff, data1, data2 /* Non-zero if differences found. */
228 csinv endloop, diff, xzr, eq
229 cbnz endloop, .Lunequal_proc
230
231 /*The second part process*/
232 ldr data1, [src1], #8
233 ldr data2, [src2], #8
234 subs limit_wd, limit_wd, #1
235 sub tmp1, data1, zeroones
236 orr tmp2, data1, #REP8_7f
237 eor diff, data1, data2 /* Non-zero if differences found. */
 238 csinv endloop, diff, xzr, ne /* if limit_wd is 0, the comparison will finish */
239 bics has_nul, tmp1, tmp2
 240 ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte */
241 b.eq .Lloopcmp_proc
242
243.Lunequal_proc:
244 orr syndrome, diff, has_nul
245 cbz syndrome, .Lremain8
246.Lcal_cmpresult:
247 /*
 248 * Byte-swap the data into big-endian order so that CLZ can find the
 249 * most significant zero bits.
250 */
251CPU_LE( rev syndrome, syndrome )
252CPU_LE( rev data1, data1 )
253CPU_LE( rev data2, data2 )
254 /*
255 * For big-endian we cannot use the trick with the syndrome value
256 * as carry-propagation can corrupt the upper bits if the trailing
257 * bytes in the string contain 0x01.
258 * However, if there is no NUL byte in the dword, we can generate
259 * the result directly. We can't just subtract the bytes as the
260 * MSB might be significant.
261 */
262CPU_BE( cbnz has_nul, 1f )
263CPU_BE( cmp data1, data2 )
264CPU_BE( cset result, ne )
265CPU_BE( cneg result, result, lo )
266CPU_BE( ret )
267CPU_BE( 1: )
268 /* Re-compute the NUL-byte detection, using a byte-reversed value.*/
269CPU_BE( rev tmp3, data1 )
270CPU_BE( sub tmp1, tmp3, zeroones )
271CPU_BE( orr tmp2, tmp3, #REP8_7f )
272CPU_BE( bic has_nul, tmp1, tmp2 )
273CPU_BE( rev has_nul, has_nul )
274CPU_BE( orr syndrome, diff, has_nul )
275 /*
276 * The MS-non-zero bit of the syndrome marks either the first bit
277 * that is different, or the top bit of the first zero byte.
278 * Shifting left now will bring the critical information into the
279 * top bits.
280 */
281 clz pos, syndrome
282 lsl data1, data1, pos
283 lsl data2, data2, pos
284 /*
285 * But we need to zero-extend (char is unsigned) the value and then
286 * perform a signed 32-bit subtraction.
287 */
288 lsr data1, data1, #56
289 sub result, data1, data2, lsr #56
290 ret
291
292.Lremain8:
293 /* Limit % 8 == 0 => all bytes significant. */
294 ands limit, limit, #7
295 b.eq .Lret0
296.Ltiny8proc:
297 ldrb data1w, [src1], #1
298 ldrb data2w, [src2], #1
299 subs limit, limit, #1
300
301 ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
302 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
303 b.eq .Ltiny8proc
304 sub result, data1, data2
305 ret
306
307.Lret0:
308 mov result, #0
309 ret
310ENDPROC(strncmp)
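
The "sub limit_wd, limit, #1" before the shift is easy to misread; without it, a limit that is an exact multiple of 8 would let the aligned loop run one dword past the caller's bound before the limit check fires. A tiny illustrative helper (not kernel code):

    #include <stdint.h>

    /* Number of times the aligned loop may decrement before the "last
     * dword" condition triggers; the caller has already rejected limit == 0. */
    static uint64_t limit_to_dwords(uint64_t limit)
    {
            return (limit - 1) >> 3;
    }

With limit == 16 this yields 1, so the loop's subs goes negative exactly on the second (final) dword; had limit_wd been 2, a third dword beyond the caller's bound would have been compared.
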
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
new file mode 100644
index 000000000000..2ca665711bf2
--- /dev/null
+++ b/arch/arm64/lib/strnlen.S
@@ -0,0 +1,171 @@
1/*
2 * Copyright (C) 2013 ARM Ltd.
3 * Copyright (C) 2013 Linaro.
4 *
5 * This code is based on glibc cortex strings work originally authored by Linaro
6 * and re-licensed under GPLv2 for the Linux kernel. The original code can
7 * be found @
8 *
9 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
10 * files/head:/src/aarch64/
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25#include <linux/linkage.h>
26#include <asm/assembler.h>
27
28/*
29 * determine the length of a fixed-size string
30 *
31 * Parameters:
32 * x0 - const string pointer
33 * x1 - maximal string length
34 * Returns:
35 * x0 - the return length of specific string
36 */
37
38/* Arguments and results. */
39srcin .req x0
40len .req x0
41limit .req x1
42
43/* Locals and temporaries. */
44src .req x2
45data1 .req x3
46data2 .req x4
47data2a .req x5
48has_nul1 .req x6
49has_nul2 .req x7
50tmp1 .req x8
51tmp2 .req x9
52tmp3 .req x10
53tmp4 .req x11
54zeroones .req x12
55pos .req x13
56limit_wd .req x14
57
58#define REP8_01 0x0101010101010101
59#define REP8_7f 0x7f7f7f7f7f7f7f7f
60#define REP8_80 0x8080808080808080
61
62ENTRY(strnlen)
63 cbz limit, .Lhit_limit
64 mov zeroones, #REP8_01
65 bic src, srcin, #15
66 ands tmp1, srcin, #15
67 b.ne .Lmisaligned
68 /* Calculate the number of full and partial words -1. */
69 sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
70 lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
71
72 /*
73 * NUL detection works on the principle that (X - 1) & (~X) & 0x80
74 * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
75 * can be done in parallel across the entire word.
76 */
77 /*
78 * The inner loop deals with two Dwords at a time. This has a
79 * slightly higher start-up cost, but we should win quite quickly,
80 * especially on cores with a high number of issue slots per
81 * cycle, as we get much better parallelism out of the operations.
82 */
83.Lloop:
84 ldp data1, data2, [src], #16
85.Lrealigned:
86 sub tmp1, data1, zeroones
87 orr tmp2, data1, #REP8_7f
88 sub tmp3, data2, zeroones
89 orr tmp4, data2, #REP8_7f
90 bic has_nul1, tmp1, tmp2
91 bic has_nul2, tmp3, tmp4
92 subs limit_wd, limit_wd, #1
93 orr tmp1, has_nul1, has_nul2
94 ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
95 b.eq .Lloop
96
97 cbz tmp1, .Lhit_limit /* No null in final Qword. */
98
99 /*
100 * We know there's a null in the final Qword. The easiest thing
101 * to do now is work out the length of the string and return
102 * MIN (len, limit).
103 */
104 sub len, src, srcin
105 cbz has_nul1, .Lnul_in_data2
106CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/
107
108 sub len, len, #8
109 mov has_nul2, has_nul1
110.Lnul_in_data2:
111 /*
112 * For big-endian, carry propagation (if the final byte in the
113 * string is 0x01) means we cannot use has_nul directly. The
114 * easiest way to get the correct byte is to byte-swap the data
115 * and calculate the syndrome a second time.
116 */
117CPU_BE( rev data2, data2 )
118CPU_BE( sub tmp1, data2, zeroones )
119CPU_BE( orr tmp2, data2, #REP8_7f )
120CPU_BE( bic has_nul2, tmp1, tmp2 )
121
122 sub len, len, #8
123 rev has_nul2, has_nul2
124 clz pos, has_nul2
125 add len, len, pos, lsr #3 /* Bits to bytes. */
126 cmp len, limit
127 csel len, len, limit, ls /* Return the lower value. */
128 ret
129
130.Lmisaligned:
131 /*
132 * Deal with a partial first word.
133 * We're doing two things in parallel here;
134 * 1) Calculate the number of words (but avoiding overflow if
135 * limit is near ULONG_MAX) - to do this we need to work out
136 * limit + tmp1 - 1 as a 65-bit value before shifting it;
137 * 2) Load and mask the initial data words - we force the bytes
138 * before the ones we are interested in to 0xff - this ensures
139 * early bytes will not hit any zero detection.
140 */
141 ldp data1, data2, [src], #16
142
143 sub limit_wd, limit, #1
144 and tmp3, limit_wd, #15
145 lsr limit_wd, limit_wd, #4
146
147 add tmp3, tmp3, tmp1
148 add limit_wd, limit_wd, tmp3, lsr #4
149
150 neg tmp4, tmp1
151 lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
152
153 mov tmp2, #~0
154 /* Big-endian. Early bytes are at MSB. */
155CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */
156 /* Little-endian. Early bytes are at LSB. */
157CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */
158
159 cmp tmp1, #8
160
161 orr data1, data1, tmp2
162 orr data2a, data2, tmp2
163
164 csinv data1, data1, xzr, le
165 csel data2, data2, data2a, le
166 b .Lrealigned
167
168.Lhit_limit:
169 mov len, limit
170 ret
171ENDPROC(strnlen)
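
The misaligned entry needs floor((limit - 1 + offset) / 16) Qwords, but limit can be close to ULONG_MAX, so the sum is never formed directly. A C sketch of the overflow-free split the assembly uses (offset = srcin & 15, limit != 0; names are illustrative):

    #include <stdint.h>

    static uint64_t qwords_to_scan(uint64_t limit, uint64_t offset)
    {
            uint64_t wd  = (limit - 1) >> 4;                /* whole 16-byte blocks */
            uint64_t rem = ((limit - 1) & 15) + offset;     /* at most 15 + 15 = 30 */

            return wd + (rem >> 4);                         /* carry from the remainder */
    }
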
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index b51d36401d83..3ecb56c624d3 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := dma-mapping.o extable.o fault.o init.o \ 1obj-y := dma-mapping.o extable.o fault.o init.o \
2 cache.o copypage.o flush.o \ 2 cache.o copypage.o flush.o \
3 ioremap.o mmap.o pgd.o mmu.o \ 3 ioremap.o mmap.o pgd.o mmu.o \
4 context.o tlb.o proc.o 4 context.o proc.o
5obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 5obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index fda756875fa6..23663837acff 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -31,7 +31,7 @@
31 * Corrupted registers: x0-x7, x9-x11 31 * Corrupted registers: x0-x7, x9-x11
32 */ 32 */
33__flush_dcache_all: 33__flush_dcache_all:
34 dsb sy // ensure ordering with previous memory accesses 34 dmb sy // ensure ordering with previous memory accesses
35 mrs x0, clidr_el1 // read clidr 35 mrs x0, clidr_el1 // read clidr
36 and x3, x0, #0x7000000 // extract loc from clidr 36 and x3, x0, #0x7000000 // extract loc from clidr
37 lsr x3, x3, #23 // left align loc bit field 37 lsr x3, x3, #23 // left align loc bit field
@@ -128,7 +128,7 @@ USER(9f, dc cvau, x4 ) // clean D line to PoU
128 add x4, x4, x2 128 add x4, x4, x2
129 cmp x4, x1 129 cmp x4, x1
130 b.lo 1b 130 b.lo 1b
131 dsb sy 131 dsb ish
132 132
133 icache_line_size x2, x3 133 icache_line_size x2, x3
134 sub x3, x2, #1 134 sub x3, x2, #1
@@ -139,7 +139,7 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU
139 cmp x4, x1 139 cmp x4, x1
140 b.lo 1b 140 b.lo 1b
1419: // ignore any faulting cache operation 1419: // ignore any faulting cache operation
142 dsb sy 142 dsb ish
143 isb 143 isb
144 ret 144 ret
145ENDPROC(flush_icache_range) 145ENDPROC(flush_icache_range)
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index c851eb44dc50..4164c5ace9f8 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -115,7 +115,7 @@ static void *__dma_alloc_noncoherent(struct device *dev, size_t size,
115 for (i = 0; i < (size >> PAGE_SHIFT); i++) 115 for (i = 0; i < (size >> PAGE_SHIFT); i++)
116 map[i] = page + i; 116 map[i] = page + i;
117 coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, 117 coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP,
118 __get_dma_pgprot(attrs, pgprot_default, false)); 118 __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false));
119 kfree(map); 119 kfree(map);
120 if (!coherent_ptr) 120 if (!coherent_ptr)
121 goto no_map; 121 goto no_map;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c23751b06120..bcc965e2cce1 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -32,6 +32,7 @@
32 32
33#include <asm/exception.h> 33#include <asm/exception.h>
34#include <asm/debug-monitors.h> 34#include <asm/debug-monitors.h>
35#include <asm/esr.h>
35#include <asm/system_misc.h> 36#include <asm/system_misc.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
@@ -123,6 +124,7 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
123 } 124 }
124 125
125 tsk->thread.fault_address = addr; 126 tsk->thread.fault_address = addr;
127 tsk->thread.fault_code = esr;
126 si.si_signo = sig; 128 si.si_signo = sig;
127 si.si_errno = 0; 129 si.si_errno = 0;
128 si.si_code = code; 130 si.si_code = code;
@@ -148,8 +150,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
148#define VM_FAULT_BADMAP 0x010000 150#define VM_FAULT_BADMAP 0x010000
149#define VM_FAULT_BADACCESS 0x020000 151#define VM_FAULT_BADACCESS 0x020000
150 152
151#define ESR_WRITE (1 << 6)
152#define ESR_CM (1 << 8)
153#define ESR_LNX_EXEC (1 << 24) 153#define ESR_LNX_EXEC (1 << 24)
154 154
155static int __do_page_fault(struct mm_struct *mm, unsigned long addr, 155static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
@@ -218,7 +218,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
218 218
219 if (esr & ESR_LNX_EXEC) { 219 if (esr & ESR_LNX_EXEC) {
220 vm_flags = VM_EXEC; 220 vm_flags = VM_EXEC;
221 } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { 221 } else if ((esr & ESR_EL1_WRITE) && !(esr & ESR_EL1_CM)) {
222 vm_flags = VM_WRITE; 222 vm_flags = VM_WRITE;
223 mm_flags |= FAULT_FLAG_WRITE; 223 mm_flags |= FAULT_FLAG_WRITE;
224 } 224 }
@@ -525,7 +525,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
525 info.si_errno = 0; 525 info.si_errno = 0;
526 info.si_code = inf->code; 526 info.si_code = inf->code;
527 info.si_addr = (void __user *)addr; 527 info.si_addr = (void __user *)addr;
528 arm64_notify_die("", regs, &info, esr); 528 arm64_notify_die("", regs, &info, 0);
529 529
530 return 0; 530 return 0;
531} 531}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 4a829a210bb6..c43f1dd19489 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -43,11 +43,6 @@
43struct page *empty_zero_page; 43struct page *empty_zero_page;
44EXPORT_SYMBOL(empty_zero_page); 44EXPORT_SYMBOL(empty_zero_page);
45 45
46pgprot_t pgprot_default;
47EXPORT_SYMBOL(pgprot_default);
48
49static pmdval_t prot_sect_kernel;
50
51struct cachepolicy { 46struct cachepolicy {
52 const char policy[16]; 47 const char policy[16];
53 u64 mair; 48 u64 mair;
@@ -122,33 +117,6 @@ static int __init early_cachepolicy(char *p)
122} 117}
123early_param("cachepolicy", early_cachepolicy); 118early_param("cachepolicy", early_cachepolicy);
124 119
125/*
126 * Adjust the PMD section entries according to the CPU in use.
127 */
128void __init init_mem_pgprot(void)
129{
130 pteval_t default_pgprot;
131 int i;
132
133 default_pgprot = PTE_ATTRINDX(MT_NORMAL);
134 prot_sect_kernel = PMD_TYPE_SECT | PMD_SECT_AF | PMD_ATTRINDX(MT_NORMAL);
135
136#ifdef CONFIG_SMP
137 /*
138 * Mark memory with the "shared" attribute for SMP systems
139 */
140 default_pgprot |= PTE_SHARED;
141 prot_sect_kernel |= PMD_SECT_S;
142#endif
143
144 for (i = 0; i < 16; i++) {
145 unsigned long v = pgprot_val(protection_map[i]);
146 protection_map[i] = __pgprot(v | default_pgprot);
147 }
148
149 pgprot_default = __pgprot(PTE_TYPE_PAGE | PTE_AF | default_pgprot);
150}
151
152pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 120pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
153 unsigned long size, pgprot_t vma_prot) 121 unsigned long size, pgprot_t vma_prot)
154{ 122{
@@ -196,11 +164,10 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
196 pgprot_t prot_pte; 164 pgprot_t prot_pte;
197 165
198 if (map_io) { 166 if (map_io) {
199 prot_sect = PMD_TYPE_SECT | PMD_SECT_AF | 167 prot_sect = PROT_SECT_DEVICE_nGnRE;
200 PMD_ATTRINDX(MT_DEVICE_nGnRE);
201 prot_pte = __pgprot(PROT_DEVICE_nGnRE); 168 prot_pte = __pgprot(PROT_DEVICE_nGnRE);
202 } else { 169 } else {
203 prot_sect = prot_sect_kernel; 170 prot_sect = PROT_SECT_NORMAL_EXEC;
204 prot_pte = PAGE_KERNEL_EXEC; 171 prot_pte = PAGE_KERNEL_EXEC;
205 } 172 }
206 173
@@ -242,7 +209,30 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
242 209
243 do { 210 do {
244 next = pud_addr_end(addr, end); 211 next = pud_addr_end(addr, end);
245 alloc_init_pmd(pud, addr, next, phys, map_io); 212
213 /*
214 * For 4K granule only, attempt to put down a 1GB block
215 */
216 if (!map_io && (PAGE_SHIFT == 12) &&
217 ((addr | next | phys) & ~PUD_MASK) == 0) {
218 pud_t old_pud = *pud;
219 set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC));
220
221 /*
222 * If we have an old value for a pud, it will
223 * be pointing to a pmd table that we no longer
224 * need (from swapper_pg_dir).
225 *
226 * Look up the old pmd table and free it.
227 */
228 if (!pud_none(old_pud)) {
229 phys_addr_t table = __pa(pmd_offset(&old_pud, 0));
230 memblock_free(table, PAGE_SIZE);
231 flush_tlb_all();
232 }
233 } else {
234 alloc_init_pmd(pud, addr, next, phys, map_io);
235 }
246 phys += next - addr; 236 phys += next - addr;
247 } while (pud++, addr = next, addr != end); 237 } while (pud++, addr = next, addr != end);
248} 238}
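
The new pud-level path only installs a 1GB block when the whole entry can be described by it: the condition ((addr | next | phys) & ~PUD_MASK) == 0 means the virtual start, the virtual end and the physical address are all PUD-aligned, so the range covers exactly one 1GB entry. A hedged C illustration of the check, with an assumed 4K-granule PUD_MASK:

    #include <stdbool.h>
    #include <stdint.h>

    #define PUD_SHIFT   30                              /* 1GB entries with 4K pages */
    #define PUD_MASK    (~((UINT64_C(1) << PUD_SHIFT) - 1))

    /* True when [addr, next) maps phys with everything 1GB-aligned, so a
     * single block entry can replace a whole pmd table. */
    static bool can_use_pud_block(uint64_t addr, uint64_t next, uint64_t phys)
    {
            return ((addr | next | phys) & ~PUD_MASK) == 0;
    }
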
@@ -399,6 +389,9 @@ int kern_addr_valid(unsigned long addr)
399 if (pud_none(*pud)) 389 if (pud_none(*pud))
400 return 0; 390 return 0;
401 391
392 if (pud_sect(*pud))
393 return pfn_valid(pud_pfn(*pud));
394
402 pmd = pmd_offset(pud, addr); 395 pmd = pmd_offset(pud, addr);
403 if (pmd_none(*pmd)) 396 if (pmd_none(*pmd))
404 return 0; 397 return 0;
@@ -446,7 +439,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
446 if (!p) 439 if (!p)
447 return -ENOMEM; 440 return -ENOMEM;
448 441
449 set_pmd(pmd, __pmd(__pa(p) | prot_sect_kernel)); 442 set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL));
450 } else 443 } else
451 vmemmap_verify((pte_t *)pmd, node, addr, next); 444 vmemmap_verify((pte_t *)pmd, node, addr, next);
452 } while (addr = next, addr != end); 445 } while (addr = next, addr != end);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 9042aff5e9e3..7736779c9809 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -182,7 +182,7 @@ ENDPROC(cpu_do_switch_mm)
182ENTRY(__cpu_setup) 182ENTRY(__cpu_setup)
183 ic iallu // I+BTB cache invalidate 183 ic iallu // I+BTB cache invalidate
184 tlbi vmalle1is // invalidate I + D TLBs 184 tlbi vmalle1is // invalidate I + D TLBs
185 dsb sy 185 dsb ish
186 186
187 mov x0, #3 << 20 187 mov x0, #3 << 20
188 msr cpacr_el1, x0 // Enable FP/ASIMD 188 msr cpacr_el1, x0 // Enable FP/ASIMD
diff --git a/arch/arm64/mm/tlb.S b/arch/arm64/mm/tlb.S
deleted file mode 100644
index 19da91e0cd27..000000000000
--- a/arch/arm64/mm/tlb.S
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Based on arch/arm/mm/tlb.S
- *
- * Copyright (C) 1997-2002 Russell King
- * Copyright (C) 2012 ARM Ltd.
- * Written by Catalin Marinas <catalin.marinas@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-#include "proc-macros.S"
-
-/*
- *	__cpu_flush_user_tlb_range(start, end, vma)
- *
- *	Invalidate a range of TLB entries in the specified address space.
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- *	- vma   - vma_struct describing address range
- */
-ENTRY(__cpu_flush_user_tlb_range)
-	vma_vm_mm x3, x2			// get vma->vm_mm
-	mmid	w3, x3				// get vm_mm->context.id
-	dsb	sy
-	lsr	x0, x0, #12			// align address
-	lsr	x1, x1, #12
-	bfi	x0, x3, #48, #16		// start VA and ASID
-	bfi	x1, x3, #48, #16		// end VA and ASID
-1:	tlbi	vae1is, x0			// TLB invalidate by address and ASID
-	add	x0, x0, #1
-	cmp	x0, x1
-	b.lo	1b
-	dsb	sy
-	ret
-ENDPROC(__cpu_flush_user_tlb_range)
-
-/*
- *	__cpu_flush_kern_tlb_range(start,end)
- *
- *	Invalidate a range of kernel TLB entries.
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- */
-ENTRY(__cpu_flush_kern_tlb_range)
-	dsb	sy
-	lsr	x0, x0, #12			// align address
-	lsr	x1, x1, #12
-1:	tlbi	vaae1is, x0			// TLB invalidate by address
-	add	x0, x0, #1
-	cmp	x0, x1
-	b.lo	1b
-	dsb	sy
-	isb
-	ret
-ENDPROC(__cpu_flush_kern_tlb_range)
diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h
index 8a029505d7b7..2f1c3c2657ad 100644
--- a/arch/blackfin/include/asm/ftrace.h
+++ b/arch/blackfin/include/asm/ftrace.h
@@ -66,16 +66,7 @@ extern inline void *return_address(unsigned int level)
 
 #endif /* CONFIG_FRAME_POINTER */
 
-#define HAVE_ARCH_CALLER_ADDR
-
-/* inline function or macro may lead to unexpected result */
-#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#define CALLER_ADDR1 ((unsigned long)return_address(1))
-#define CALLER_ADDR2 ((unsigned long)return_address(2))
-#define CALLER_ADDR3 ((unsigned long)return_address(3))
-#define CALLER_ADDR4 ((unsigned long)return_address(4))
-#define CALLER_ADDR5 ((unsigned long)return_address(5))
-#define CALLER_ADDR6 ((unsigned long)return_address(6))
+#define ftrace_return_address(n) return_address(n)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h
index 72c0fafaa039..544ed8ef87eb 100644
--- a/arch/parisc/include/asm/ftrace.h
+++ b/arch/parisc/include/asm/ftrace.h
@@ -24,15 +24,7 @@ extern void return_to_handler(void);
 
 extern unsigned long return_address(unsigned int);
 
-#define HAVE_ARCH_CALLER_ADDR
-
-#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#define CALLER_ADDR1 return_address(1)
-#define CALLER_ADDR2 return_address(2)
-#define CALLER_ADDR3 return_address(3)
-#define CALLER_ADDR4 return_address(4)
-#define CALLER_ADDR5 return_address(5)
-#define CALLER_ADDR6 return_address(6)
+#define ftrace_return_address(n) return_address(n)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h
index 13e9966464c2..e79fb6ebaa42 100644
--- a/arch/sh/include/asm/ftrace.h
+++ b/arch/sh/include/asm/ftrace.h
@@ -40,15 +40,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 /* arch/sh/kernel/return_address.c */
 extern void *return_address(unsigned int);
 
-#define HAVE_ARCH_CALLER_ADDR
-
-#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#define CALLER_ADDR1 ((unsigned long)return_address(1))
-#define CALLER_ADDR2 ((unsigned long)return_address(2))
-#define CALLER_ADDR3 ((unsigned long)return_address(3))
-#define CALLER_ADDR4 ((unsigned long)return_address(4))
-#define CALLER_ADDR5 ((unsigned long)return_address(5))
-#define CALLER_ADDR6 ((unsigned long)return_address(6))
+#define ftrace_return_address(n) return_address(n)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h
index 736b9d214d80..6c6d9a9f185f 100644
--- a/arch/xtensa/include/asm/ftrace.h
+++ b/arch/xtensa/include/asm/ftrace.h
@@ -12,24 +12,18 @@
 
 #include <asm/processor.h>
 
-#define HAVE_ARCH_CALLER_ADDR
 #ifndef __ASSEMBLY__
-#define CALLER_ADDR0 ({ unsigned long a0, a1; \
+#define ftrace_return_address0 ({ unsigned long a0, a1; \
 		__asm__ __volatile__ ( \
 			"mov %0, a0\n" \
 			"mov %1, a1\n" \
 			: "=r"(a0), "=r"(a1)); \
 		MAKE_PC_FROM_RA(a0, a1); })
+
 #ifdef CONFIG_FRAME_POINTER
 extern unsigned long return_address(unsigned level);
-#define CALLER_ADDR1 return_address(1)
-#define CALLER_ADDR2 return_address(2)
-#define CALLER_ADDR3 return_address(3)
-#else /* CONFIG_FRAME_POINTER */
-#define CALLER_ADDR1 (0)
-#define CALLER_ADDR2 (0)
-#define CALLER_ADDR3 (0)
-#endif /* CONFIG_FRAME_POINTER */
+#define ftrace_return_address(n) return_address(n)
+#endif
 #endif /* __ASSEMBLY__ */
 
 #ifdef CONFIG_FUNCTION_TRACER
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h
index 03cf5936bad6..1ac097279db1 100644
--- a/include/asm-generic/unaligned.h
+++ b/include/asm-generic/unaligned.h
@@ -4,22 +4,27 @@
 /*
  * This is the most generic implementation of unaligned accesses
  * and should work almost anywhere.
- *
- * If an architecture can handle unaligned accesses in hardware,
- * it may want to use the linux/unaligned/access_ok.h implementation
- * instead.
  */
 #include <asm/byteorder.h>
 
+/* Set by the arch if it can handle unaligned accesses in hardware. */
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+# include <linux/unaligned/access_ok.h>
+#endif
+
 #if defined(__LITTLE_ENDIAN)
-# include <linux/unaligned/le_struct.h>
-# include <linux/unaligned/be_byteshift.h>
+# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+# include <linux/unaligned/le_struct.h>
+# include <linux/unaligned/be_byteshift.h>
+# endif
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_le
 # define put_unaligned	__put_unaligned_le
 #elif defined(__BIG_ENDIAN)
-# include <linux/unaligned/be_struct.h>
-# include <linux/unaligned/le_byteshift.h>
+# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+# include <linux/unaligned/be_struct.h>
+# include <linux/unaligned/le_byteshift.h>
+# endif
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_be
 # define put_unaligned	__put_unaligned_be
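For illustration only, not part of the patch above: whichever branch the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS test selects, callers keep using the same accessors exported through <asm/unaligned.h>. A minimal usage sketch; the function name is hypothetical:

/* Illustration only -- not part of the patch. */
#include <linux/types.h>
#include <asm/unaligned.h>

static u32 read_le32_at(const u8 *buf, unsigned int off)
{
	/* buf + off need not be 4-byte aligned; get_unaligned_le32() copes either way. */
	return get_unaligned_le32(buf + off);
}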
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ae9504b4b67d..2018751cad9e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -616,25 +616,27 @@ static inline void __ftrace_enabled_restore(int enabled)
 #endif
 }
 
-#ifndef HAVE_ARCH_CALLER_ADDR
+/* All archs should have this, but we define it for consistency */
+#ifndef ftrace_return_address0
+# define ftrace_return_address0 __builtin_return_address(0)
+#endif
+
+/* Archs may use other ways for ADDR1 and beyond */
+#ifndef ftrace_return_address
 # ifdef CONFIG_FRAME_POINTER
-#  define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#  define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
-#  define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
-#  define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
-#  define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
-#  define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
-#  define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
+#  define ftrace_return_address(n) __builtin_return_address(n)
 # else
-#  define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
-#  define CALLER_ADDR1 0UL
-#  define CALLER_ADDR2 0UL
-#  define CALLER_ADDR3 0UL
-#  define CALLER_ADDR4 0UL
-#  define CALLER_ADDR5 0UL
-#  define CALLER_ADDR6 0UL
+#  define ftrace_return_address(n) 0UL
 # endif
-#endif /* ifndef HAVE_ARCH_CALLER_ADDR */
+#endif
+
+#define CALLER_ADDR0 ((unsigned long)ftrace_return_address0)
+#define CALLER_ADDR1 ((unsigned long)ftrace_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)ftrace_return_address(2))
+#define CALLER_ADDR3 ((unsigned long)ftrace_return_address(3))
+#define CALLER_ADDR4 ((unsigned long)ftrace_return_address(4))
+#define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
+#define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
 
 #ifdef CONFIG_IRQSOFF_TRACER
   extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
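For illustration only, not part of the patch above: after this rework an architecture only needs to define ftrace_return_address(n) (and, optionally, ftrace_return_address0) in its own asm/ftrace.h, and the generic CALLER_ADDRx macros expand to it. A sketch with a hypothetical arch helper, not taken from any real header:

/* Illustration only -- hypothetical <asm/ftrace.h> fragment. */
extern unsigned long my_arch_return_address(unsigned int level);
#define ftrace_return_address(n) my_arch_return_address(n)

/*
 * The generic <linux/ftrace.h> definitions above then expand, e.g.:
 *   CALLER_ADDR2 -> (unsigned long)my_arch_return_address(2)
 *   CALLER_ADDR0 -> (unsigned long)__builtin_return_address(0)
 */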
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index 9c22317778eb..e11aa4a156d2 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -40,6 +40,11 @@
 #define R_METAG_NONE 3
 #endif
 
+#ifndef EM_AARCH64
+#define EM_AARCH64	183
+#define R_AARCH64_ABS64	257
+#endif
+
 static int fd_map;	/* File descriptor for file being modified. */
 static int mmap_failed;	/* Boolean flag. */
 static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */
@@ -347,6 +352,8 @@ do_file(char const *const fname)
 	case EM_ARM:	 reltype = R_ARM_ABS32;
 			 altmcount = "__gnu_mcount_nc";
 			 break;
+	case EM_AARCH64:
+			 reltype = R_AARCH64_ABS64; gpfx = '_'; break;
 	case EM_IA_64:	 reltype = R_IA64_IMM64;   gpfx = '_'; break;
 	case EM_METAG:	 reltype = R_METAG_ADDR32;
 			 altmcount = "_mcount_wrapper";
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 91280b82da08..397b6b84e8c5 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -279,6 +279,11 @@ if ($arch eq "x86_64") {
 	$mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_ARM_(CALL|PC24|THM_CALL)" .
 			"\\s+(__gnu_mcount_nc|mcount)\$";
 
+} elsif ($arch eq "arm64") {
+	$alignment = 3;
+	$section_type = '%progbits';
+	$mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_AARCH64_CALL26\\s+_mcount\$";
+	$type = ".quad";
 } elsif ($arch eq "ia64") {
 	$mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$";
 	$type = "data8";