-rw-r--r--  arch/arm/Kconfig | 36
-rw-r--r--  arch/arm/Kconfig.debug | 41
-rw-r--r--  arch/arm/Makefile | 1
-rw-r--r--  arch/arm/boot/compressed/head.S | 9
-rw-r--r--  arch/arm/common/Makefile | 2
-rw-r--r--  arch/arm/common/bL_switcher.c | 822
-rw-r--r--  arch/arm/common/bL_switcher_dummy_if.c | 71
-rw-r--r--  arch/arm/common/mcpm_entry.c | 27
-rw-r--r--  arch/arm/common/mcpm_head.S | 18
-rw-r--r--  arch/arm/common/mcpm_platsmp.c | 27
-rw-r--r--  arch/arm/common/timer-sp.c | 2
-rw-r--r--  arch/arm/configs/h3600_defconfig | 22
-rw-r--r--  arch/arm/crypto/.gitignore | 1
-rw-r--r--  arch/arm/crypto/Makefile | 14
-rw-r--r--  arch/arm/crypto/aes_glue.c | 22
-rw-r--r--  arch/arm/crypto/aes_glue.h | 19
-rw-r--r--  arch/arm/crypto/aesbs-core.S_shipped | 2544
-rw-r--r--  arch/arm/crypto/aesbs-glue.c | 434
-rw-r--r--  arch/arm/crypto/bsaes-armv7.pl | 2467
-rw-r--r--  arch/arm/include/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/asm/assembler.h | 7
-rw-r--r--  arch/arm/include/asm/atomic.h | 108
-rw-r--r--  arch/arm/include/asm/bL_switcher.h | 77
-rw-r--r--  arch/arm/include/asm/bug.h | 10
-rw-r--r--  arch/arm/include/asm/cacheflush.h | 46
-rw-r--r--  arch/arm/include/asm/cmpxchg.h | 58
-rw-r--r--  arch/arm/include/asm/cputype.h | 1
-rw-r--r--  arch/arm/include/asm/hardirq.h | 2
-rw-r--r--  arch/arm/include/asm/hardware/coresight.h | 8
-rw-r--r--  arch/arm/include/asm/kgdb.h | 3
-rw-r--r--  arch/arm/include/asm/mach/arch.h | 1
-rw-r--r--  arch/arm/include/asm/mcpm.h | 39
-rw-r--r--  arch/arm/include/asm/memory.h | 76
-rw-r--r--  arch/arm/include/asm/mmu.h | 2
-rw-r--r--  arch/arm/include/asm/pgtable-2level.h | 7
-rw-r--r--  arch/arm/include/asm/pgtable-3level.h | 3
-rw-r--r--  arch/arm/include/asm/processor.h | 33
-rw-r--r--  arch/arm/include/asm/setup.h | 2
-rw-r--r--  arch/arm/include/asm/smp.h | 2
-rw-r--r--  arch/arm/include/asm/spinlock.h | 36
-rw-r--r--  arch/arm/include/asm/spinlock_types.h | 2
-rw-r--r--  arch/arm/include/asm/tlbflush.h | 48
-rw-r--r--  arch/arm/include/asm/unified.h | 4
-rw-r--r--  arch/arm/include/debug/efm32.S | 45
-rw-r--r--  arch/arm/include/debug/msm.S | 5
-rw-r--r--  arch/arm/include/debug/pl01x.S | 2
-rw-r--r--  arch/arm/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/arm/include/uapi/asm/perf_regs.h | 23
-rw-r--r--  arch/arm/kernel/Makefile | 4
-rw-r--r--  arch/arm/kernel/armksyms.c | 1
-rw-r--r--  arch/arm/kernel/entry-armv.S | 6
-rw-r--r--  arch/arm/kernel/entry-common.S | 4
-rw-r--r--  arch/arm/kernel/head.S | 82
-rw-r--r--  arch/arm/kernel/hw_breakpoint.c | 14
-rw-r--r--  arch/arm/kernel/kprobes.c | 8
-rw-r--r--  arch/arm/kernel/module.c | 57
-rw-r--r--  arch/arm/kernel/perf_event.c | 3
-rw-r--r--  arch/arm/kernel/perf_event_cpu.c | 2
-rw-r--r--  arch/arm/kernel/perf_regs.c | 30
-rw-r--r--  arch/arm/kernel/setup.c | 28
-rw-r--r--  arch/arm/kernel/signal.c | 38
-rw-r--r--  arch/arm/kernel/sigreturn_codes.S | 80
-rw-r--r--  arch/arm/kernel/sleep.S | 27
-rw-r--r--  arch/arm/kernel/smp.c | 42
-rw-r--r--  arch/arm/kernel/smp_scu.c | 14
-rw-r--r--  arch/arm/kernel/smp_tlb.c | 36
-rw-r--r--  arch/arm/kernel/smp_twd.c | 24
-rw-r--r--  arch/arm/kernel/suspend.c | 8
-rw-r--r--  arch/arm/kernel/traps.c | 24
-rw-r--r--  arch/arm/kvm/arm.c | 6
-rw-r--r--  arch/arm/lib/bitops.h | 5
-rw-r--r--  arch/arm/lib/uaccess_with_memcpy.c | 41
-rw-r--r--  arch/arm/mach-footbridge/netwinder-hw.c | 8
-rw-r--r--  arch/arm/mach-highbank/Kconfig | 3
-rw-r--r--  arch/arm/mach-ixp4xx/Kconfig | 4
-rw-r--r--  arch/arm/mach-mvebu/Kconfig | 1
-rw-r--r--  arch/arm/mach-mvebu/coherency_ll.S | 3
-rw-r--r--  arch/arm/mach-mvebu/headsmp.S | 4
-rw-r--r--  arch/arm/mach-sa1100/assabet.c | 3
-rw-r--r--  arch/arm/mach-sa1100/include/mach/gpio.h | 55
-rw-r--r--  arch/arm/mach-sa1100/include/mach/h3xxx.h | 2
-rw-r--r--  arch/arm/mach-sa1100/simpad.c | 1
-rw-r--r--  arch/arm/mach-tegra/Kconfig | 2
-rw-r--r--  arch/arm/mach-vexpress/Kconfig | 1
-rw-r--r--  arch/arm/mach-vexpress/dcscb.c | 56
-rw-r--r--  arch/arm/mach-vexpress/tc2_pm.c | 48
-rw-r--r--  arch/arm/mm/Kconfig | 6
-rw-r--r--  arch/arm/mm/abort-ev6.S | 5
-rw-r--r--  arch/arm/mm/alignment.c | 9
-rw-r--r--  arch/arm/mm/dma-mapping.c | 4
-rw-r--r--  arch/arm/mm/extable.c | 7
-rw-r--r--  arch/arm/mm/idmap.c | 8
-rw-r--r--  arch/arm/mm/mmap.c | 6
-rw-r--r--  arch/arm/mm/mmu.c | 82
-rw-r--r--  arch/arm/mm/nommu.c | 9
-rw-r--r--  arch/arm/mm/proc-v6.S | 4
-rw-r--r--  arch/arm/mm/proc-v7.S | 4
-rw-r--r--  arch/arm/net/bpf_jit_32.c | 6
-rw-r--r--  arch/arm/plat-versatile/headsmp.S | 2
-rw-r--r--  arch/arm/vfp/vfpmodule.c | 6
-rw-r--r--  arch/arm64/include/asm/atomic.h | 14
-rw-r--r--  arch/arm64/kernel/debug-monitors.c | 13
-rw-r--r--  arch/arm64/kernel/hw_breakpoint.c | 22
-rw-r--r--  arch/arm64/kernel/perf_event.c | 4
-rw-r--r--  crypto/Kconfig | 16
-rw-r--r--  drivers/bus/arm-cci.c | 6
-rw-r--r--  drivers/gpio/gpio-sa1100.c | 2
-rw-r--r--  drivers/irqchip/irq-gic.c | 151
-rw-r--r--  drivers/mmc/host/mmci.c | 78
-rw-r--r--  drivers/mmc/host/mmci.h | 4
-rw-r--r--  include/linux/amba/bus.h | 2
-rw-r--r--  include/linux/irqchip/arm-gic.h | 7
-rw-r--r--  include/trace/events/power_cpu_migrate.h | 67
-rw-r--r--  tools/perf/arch/arm/Makefile | 3
-rw-r--r--  tools/perf/arch/arm/include/perf_regs.h | 54
-rw-r--r--  tools/perf/arch/arm/util/unwind.c | 48
-rw-r--r--  tools/perf/config/Makefile | 14
-rw-r--r--  tools/perf/config/feature-checks/Makefile | 1
-rw-r--r--  tools/perf/config/feature-checks/test-all.c | 4
-rw-r--r--  tools/perf/config/feature-checks/test-libunwind-debug-frame.c | 16
-rw-r--r--  tools/perf/util/unwind.c | 75
121 files changed, 8137 insertions(+), 628 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index acb80708accd..603d661b445d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -5,6 +5,7 @@ config ARM
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAVE_CUSTOM_GPIO_H
+	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT if MMU
 	select CLONE_BACKWARDS
@@ -51,6 +52,8 @@ config ARM
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_PERF_EVENTS
+	select HAVE_PERF_REGS
+	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UID16
@@ -481,6 +484,7 @@ config ARCH_IXP4XX
 	bool "IXP4xx-based"
 	depends on MMU
 	select ARCH_HAS_DMA_SET_COHERENT_MASK
+	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARCH_REQUIRE_GPIOLIB
 	select CLKSRC_MMIO
 	select CPU_XSCALE
@@ -688,7 +692,6 @@ config ARCH_SA1100
 	select GENERIC_CLOCKEVENTS
 	select HAVE_IDE
 	select ISA
-	select NEED_MACH_GPIO_H
 	select NEED_MACH_MEMORY_H
 	select SPARSE_IRQ
 	help
@@ -1064,11 +1067,6 @@ config IWMMXT
 	  Enable support for iWMMXt context switching at run time if
 	  running on a CPU that supports it.
 
-config XSCALE_PMU
-	bool
-	depends on CPU_XSCALE
-	default y
-
 config MULTI_IRQ_HANDLER
 	bool
 	help
@@ -1516,6 +1514,32 @@ config MCPM
 	  for (multi-)cluster based systems, such as big.LITTLE based
 	  systems.
 
+config BIG_LITTLE
+	bool "big.LITTLE support (Experimental)"
+	depends on CPU_V7 && SMP
+	select MCPM
+	help
+	  This option enables support selections for the big.LITTLE
+	  system architecture.
+
+config BL_SWITCHER
+	bool "big.LITTLE switcher support"
+	depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+	select CPU_PM
+	select ARM_CPU_SUSPEND
+	help
+	  The big.LITTLE "switcher" provides the core functionality to
+	  transparently handle transition between a cluster of A15's
+	  and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+	tristate "Simple big.LITTLE switcher user interface"
+	depends on BL_SWITCHER && DEBUG_KERNEL
+	help
+	  This is a simple and dummy char dev interface to control
+	  the big.LITTLE switcher core code. It is meant for
+	  debugging purposes only.
+
 choice
 	prompt "Memory split"
 	default VMSPLIT_3G
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index d597c6b8488b..5765abf5ce84 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -318,6 +318,7 @@ choice
 	config DEBUG_MSM_UART1
 		bool "Kernel low-level debugging messages via MSM UART1"
 		depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the first serial port on MSM devices.
@@ -325,6 +326,7 @@ choice
 	config DEBUG_MSM_UART2
 		bool "Kernel low-level debugging messages via MSM UART2"
 		depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the second serial port on MSM devices.
@@ -332,6 +334,7 @@ choice
 	config DEBUG_MSM_UART3
 		bool "Kernel low-level debugging messages via MSM UART3"
 		depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the third serial port on MSM devices.
@@ -340,6 +343,7 @@ choice
 		bool "Kernel low-level debugging messages via MSM 8660 UART"
 		depends on ARCH_MSM8X60
 		select MSM_HAS_DEBUG_UART_HS
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the serial port on MSM 8660 devices.
@@ -348,10 +352,20 @@ choice
 		bool "Kernel low-level debugging messages via MSM 8960 UART"
 		depends on ARCH_MSM8960
 		select MSM_HAS_DEBUG_UART_HS
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the serial port on MSM 8960 devices.
 
+	config DEBUG_MSM8974_UART
+		bool "Kernel low-level debugging messages via MSM 8974 UART"
+		depends on ARCH_MSM8974
+		select MSM_HAS_DEBUG_UART_HS
+		select DEBUG_MSM_UART
+		help
+		  Say Y here if you want the debug print routines to direct
+		  their output to the serial port on MSM 8974 devices.
+
 	config DEBUG_MVEBU_UART
 		bool "Kernel low-level debugging messages via MVEBU UART (old bootloaders)"
 		depends on ARCH_MVEBU
@@ -841,6 +855,20 @@ choice
 	  options; the platform specific options are deprecated
 	  and will be soon removed.
 
+	config DEBUG_LL_UART_EFM32
+		bool "Kernel low-level debugging via efm32 UART"
+		depends on ARCH_EFM32
+		help
+		  Say Y here if you want the debug print routines to direct
+		  their output to an UART or USART port on efm32 based
+		  machines. Use the following addresses for DEBUG_UART_PHYS:
+
+		  0x4000c000 | USART0
+		  0x4000c400 | USART1
+		  0x4000c800 | USART2
+		  0x4000e000 | UART0
+		  0x4000e400 | UART1
+
 	config DEBUG_LL_UART_PL01X
 		bool "Kernel low-level debugging via ARM Ltd PL01x Primecell UART"
 		help
@@ -887,11 +915,16 @@ config DEBUG_STI_UART
 	bool
 	depends on ARCH_STI
 
+config DEBUG_MSM_UART
+	bool
+	depends on ARCH_MSM
+
 config DEBUG_LL_INCLUDE
 	string
 	default "debug/8250.S" if DEBUG_LL_UART_8250 || DEBUG_UART_8250
 	default "debug/pl01x.S" if DEBUG_LL_UART_PL01X || DEBUG_UART_PL01X
 	default "debug/exynos.S" if DEBUG_EXYNOS_UART
+	default "debug/efm32.S" if DEBUG_LL_UART_EFM32
 	default "debug/icedcc.S" if DEBUG_ICEDCC
 	default "debug/imx.S" if DEBUG_IMX1_UART || \
 				 DEBUG_IMX25_UART || \
@@ -902,11 +935,7 @@ config DEBUG_LL_INCLUDE
 				 DEBUG_IMX53_UART ||\
 				 DEBUG_IMX6Q_UART || \
 				 DEBUG_IMX6SL_UART
-	default "debug/msm.S" if DEBUG_MSM_UART1 || \
-				  DEBUG_MSM_UART2 || \
-				  DEBUG_MSM_UART3 || \
-				  DEBUG_MSM8660_UART || \
-				  DEBUG_MSM8960_UART
+	default "debug/msm.S" if DEBUG_MSM_UART
 	default "debug/omap2plus.S" if DEBUG_OMAP2PLUS_UART
 	default "debug/sirf.S" if DEBUG_SIRFPRIMA2_UART1 || DEBUG_SIRFMARCO_UART1
 	default "debug/sti.S" if DEBUG_STI_UART
@@ -959,6 +988,7 @@ config DEBUG_UART_PHYS
 	default 0x20064000 if DEBUG_RK29_UART1 || DEBUG_RK3X_UART2
 	default 0x20068000 if DEBUG_RK29_UART2 || DEBUG_RK3X_UART3
 	default 0x20201000 if DEBUG_BCM2835
+	default 0x4000e400 if DEBUG_LL_UART_EFM32
 	default 0x40090000 if ARCH_LPC32XX
 	default 0x40100000 if DEBUG_PXA_UART1
 	default 0x42000000 if ARCH_GEMINI
@@ -989,6 +1019,7 @@ config DEBUG_UART_PHYS
 	default 0xfff36000 if DEBUG_HIGHBANK_UART
 	default 0xfffff700 if ARCH_IOP33X
 	depends on DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \
+		DEBUG_LL_UART_EFM32 || \
 		DEBUG_UART_8250 || DEBUG_UART_PL01X
 
 config DEBUG_UART_VIRT
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 8b667132d7b4..c99b1086d83d 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -16,6 +16,7 @@ LDFLAGS :=
16LDFLAGS_vmlinux :=-p --no-undefined -X 16LDFLAGS_vmlinux :=-p --no-undefined -X
17ifeq ($(CONFIG_CPU_ENDIAN_BE8),y) 17ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
18LDFLAGS_vmlinux += --be8 18LDFLAGS_vmlinux += --be8
19LDFLAGS_MODULE += --be8
19endif 20endif
20 21
21OBJCOPYFLAGS :=-O binary -R .comment -S 22OBJCOPYFLAGS :=-O binary -R .comment -S
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 75189f13cf54..066b03480b63 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -135,6 +135,7 @@ start:
 		.word	_edata			@ zImage end address
  THUMB(	.thumb			)
 1:
+ ARM_BE8(	setend	be )			@ go BE8 if compiled for BE8
 		mrs	r9, cpsr
 #ifdef CONFIG_ARM_VIRT_EXT
 		bl	__hyp_stub_install	@ get into SVC mode, reversibly
@@ -699,9 +700,7 @@ __armv4_mmu_cache_on:
 		mrc	p15, 0, r0, c1, c0, 0	@ read control reg
 		orr	r0, r0, #0x5000		@ I-cache enable, RR cache replacement
 		orr	r0, r0, #0x0030
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		bl	__common_mmu_cache_on
 		mov	r0, #0
 		mcr	p15, 0, r0, c8, c7, 0	@ flush I,D TLBs
@@ -728,9 +727,7 @@ __armv7_mmu_cache_on:
 		orr	r0, r0, #1 << 22	@ U (v6 unaligned access model)
 						@ (needed for ARM1176)
 #ifdef CONFIG_MMU
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		mrcne	p15, 0, r6, c2, c0, 2	@ read ttb control reg
 		orrne	r0, r0, #1		@ MMU enabled
 		movne	r1, #0xfffffffd		@ domain 0 = client
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index eaa9cf4705a7..4bdc41622c36 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -16,3 +16,5 @@ obj-$(CONFIG_MCPM)		+= mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
 AFLAGS_mcpm_head.o	:= -march=armv7-a
 AFLAGS_vlock.o		:= -march=armv7-a
 obj-$(CONFIG_TI_PRIV_EDMA) += edma.o
+obj-$(CONFIG_BL_SWITCHER)	+= bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
new file mode 100644
index 000000000000..5774b6ea7ad5
--- /dev/null
+++ b/arch/arm/common/bL_switcher.c
@@ -0,0 +1,822 @@
1/*
2 * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
3 *
4 * Created by: Nicolas Pitre, March 2012
5 * Copyright: (C) 2012-2013 Linaro Limited
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#include <linux/atomic.h>
13#include <linux/init.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/sched.h>
17#include <linux/interrupt.h>
18#include <linux/cpu_pm.h>
19#include <linux/cpu.h>
20#include <linux/cpumask.h>
21#include <linux/kthread.h>
22#include <linux/wait.h>
23#include <linux/time.h>
24#include <linux/clockchips.h>
25#include <linux/hrtimer.h>
26#include <linux/tick.h>
27#include <linux/notifier.h>
28#include <linux/mm.h>
29#include <linux/mutex.h>
30#include <linux/smp.h>
31#include <linux/spinlock.h>
32#include <linux/string.h>
33#include <linux/sysfs.h>
34#include <linux/irqchip/arm-gic.h>
35#include <linux/moduleparam.h>
36
37#include <asm/smp_plat.h>
38#include <asm/cputype.h>
39#include <asm/suspend.h>
40#include <asm/mcpm.h>
41#include <asm/bL_switcher.h>
42
43#define CREATE_TRACE_POINTS
44#include <trace/events/power_cpu_migrate.h>
45
46
47/*
48 * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
49 * __attribute_const__ and we don't want the compiler to assume any
50 * constness here as the value _does_ change along some code paths.
51 */
52
53static int read_mpidr(void)
54{
55 unsigned int id;
56 asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
57 return id & MPIDR_HWID_BITMASK;
58}
59
60/*
61 * Get a global nanosecond time stamp for tracing.
62 */
63static s64 get_ns(void)
64{
65 struct timespec ts;
66 getnstimeofday(&ts);
67 return timespec_to_ns(&ts);
68}
69
70/*
71 * bL switcher core code.
72 */
73
74static void bL_do_switch(void *_arg)
75{
76 unsigned ib_mpidr, ib_cpu, ib_cluster;
77 long volatile handshake, **handshake_ptr = _arg;
78
79 pr_debug("%s\n", __func__);
80
81 ib_mpidr = cpu_logical_map(smp_processor_id());
82 ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
83 ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
84
85 /* Advertise our handshake location */
86 if (handshake_ptr) {
87 handshake = 0;
88 *handshake_ptr = &handshake;
89 } else
90 handshake = -1;
91
92 /*
93 * Our state has been saved at this point. Let's release our
94 * inbound CPU.
95 */
96 mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
97 sev();
98
99 /*
100 * From this point, we must assume that our counterpart CPU might
101 * have taken over in its parallel world already, as if execution
102 * just returned from cpu_suspend(). It is therefore important to
103 * be very careful not to make any change the other guy is not
104 * expecting. This is why we need stack isolation.
105 *
106 * Fancy under cover tasks could be performed here. For now
107 * we have none.
108 */
109
110 /*
111 * Let's wait until our inbound is alive.
112 */
113 while (!handshake) {
114 wfe();
115 smp_mb();
116 }
117
118 /* Let's put ourself down. */
119 mcpm_cpu_power_down();
120
121 /* should never get here */
122 BUG();
123}
124
125/*
126 * Stack isolation. To ensure 'current' remains valid, we just use another
127 * piece of our thread's stack space which should be fairly lightly used.
128 * The selected area starts just above the thread_info structure located
129 * at the very bottom of the stack, aligned to a cache line, and indexed
130 * with the cluster number.
131 */
132#define STACK_SIZE 512
133extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
134static int bL_switchpoint(unsigned long _arg)
135{
136 unsigned int mpidr = read_mpidr();
137 unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
138 void *stack = current_thread_info() + 1;
139 stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
140 stack += clusterid * STACK_SIZE + STACK_SIZE;
141 call_with_stack(bL_do_switch, (void *)_arg, stack);
142 BUG();
143}
144
145/*
146 * Generic switcher interface
147 */
148
149static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
150static int bL_switcher_cpu_pairing[NR_CPUS];
151
152/*
153 * bL_switch_to - Switch to a specific cluster for the current CPU
154 * @new_cluster_id: the ID of the cluster to switch to.
155 *
156 * This function must be called on the CPU to be switched.
157 * Returns 0 on success, else a negative status code.
158 */
159static int bL_switch_to(unsigned int new_cluster_id)
160{
161 unsigned int mpidr, this_cpu, that_cpu;
162 unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
163 struct completion inbound_alive;
164 struct tick_device *tdev;
165 enum clock_event_mode tdev_mode;
166 long volatile *handshake_ptr;
167 int ipi_nr, ret;
168
169 this_cpu = smp_processor_id();
170 ob_mpidr = read_mpidr();
171 ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
172 ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
173 BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
174
175 if (new_cluster_id == ob_cluster)
176 return 0;
177
178 that_cpu = bL_switcher_cpu_pairing[this_cpu];
179 ib_mpidr = cpu_logical_map(that_cpu);
180 ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
181 ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
182
183 pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
184 this_cpu, ob_mpidr, ib_mpidr);
185
186 this_cpu = smp_processor_id();
187
188 /* Close the gate for our entry vectors */
189 mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
190 mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
191
192 /* Install our "inbound alive" notifier. */
193 init_completion(&inbound_alive);
194 ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
195 ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
196 mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
197
198 /*
199 * Let's wake up the inbound CPU now in case it requires some delay
200 * to come online, but leave it gated in our entry vector code.
201 */
202 ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
203 if (ret) {
204 pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
205 return ret;
206 }
207
208 /*
209 * Raise a SGI on the inbound CPU to make sure it doesn't stall
210 * in a possible WFI, such as in bL_power_down().
211 */
212 gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
213
214 /*
215 * Wait for the inbound to come up. This allows for other
216 * tasks to be scheduled in the mean time.
217 */
218 wait_for_completion(&inbound_alive);
219 mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
220
221 /*
222 * From this point we are entering the switch critical zone
223 * and can't take any interrupts anymore.
224 */
225 local_irq_disable();
226 local_fiq_disable();
227 trace_cpu_migrate_begin(get_ns(), ob_mpidr);
228
229 /* redirect GIC's SGIs to our counterpart */
230 gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
231
232 tdev = tick_get_device(this_cpu);
233 if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
234 tdev = NULL;
235 if (tdev) {
236 tdev_mode = tdev->evtdev->mode;
237 clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
238 }
239
240 ret = cpu_pm_enter();
241
242 /* we can not tolerate errors at this point */
243 if (ret)
244 panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
245
246 /* Swap the physical CPUs in the logical map for this logical CPU. */
247 cpu_logical_map(this_cpu) = ib_mpidr;
248 cpu_logical_map(that_cpu) = ob_mpidr;
249
250 /* Let's do the actual CPU switch. */
251 ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
252 if (ret > 0)
253 panic("%s: cpu_suspend() returned %d\n", __func__, ret);
254
255 /* We are executing on the inbound CPU at this point */
256 mpidr = read_mpidr();
257 pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
258 BUG_ON(mpidr != ib_mpidr);
259
260 mcpm_cpu_powered_up();
261
262 ret = cpu_pm_exit();
263
264 if (tdev) {
265 clockevents_set_mode(tdev->evtdev, tdev_mode);
266 clockevents_program_event(tdev->evtdev,
267 tdev->evtdev->next_event, 1);
268 }
269
270 trace_cpu_migrate_finish(get_ns(), ib_mpidr);
271 local_fiq_enable();
272 local_irq_enable();
273
274 *handshake_ptr = 1;
275 dsb_sev();
276
277 if (ret)
278 pr_err("%s exiting with error %d\n", __func__, ret);
279 return ret;
280}
281
282struct bL_thread {
283 spinlock_t lock;
284 struct task_struct *task;
285 wait_queue_head_t wq;
286 int wanted_cluster;
287 struct completion started;
288 bL_switch_completion_handler completer;
289 void *completer_cookie;
290};
291
292static struct bL_thread bL_threads[NR_CPUS];
293
294static int bL_switcher_thread(void *arg)
295{
296 struct bL_thread *t = arg;
297 struct sched_param param = { .sched_priority = 1 };
298 int cluster;
299 bL_switch_completion_handler completer;
300 void *completer_cookie;
301
302 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
303 complete(&t->started);
304
305 do {
306 if (signal_pending(current))
307 flush_signals(current);
308 wait_event_interruptible(t->wq,
309 t->wanted_cluster != -1 ||
310 kthread_should_stop());
311
312 spin_lock(&t->lock);
313 cluster = t->wanted_cluster;
314 completer = t->completer;
315 completer_cookie = t->completer_cookie;
316 t->wanted_cluster = -1;
317 t->completer = NULL;
318 spin_unlock(&t->lock);
319
320 if (cluster != -1) {
321 bL_switch_to(cluster);
322
323 if (completer)
324 completer(completer_cookie);
325 }
326 } while (!kthread_should_stop());
327
328 return 0;
329}
330
331static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
332{
333 struct task_struct *task;
334
335 task = kthread_create_on_node(bL_switcher_thread, arg,
336 cpu_to_node(cpu), "kswitcher_%d", cpu);
337 if (!IS_ERR(task)) {
338 kthread_bind(task, cpu);
339 wake_up_process(task);
340 } else
341 pr_err("%s failed for CPU %d\n", __func__, cpu);
342 return task;
343}
344
345/*
346 * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
347 * with completion notification via a callback
348 *
349 * @cpu: the CPU to switch
350 * @new_cluster_id: the ID of the cluster to switch to.
351 * @completer: switch completion callback. if non-NULL,
352 * @completer(@completer_cookie) will be called on completion of
353 * the switch, in non-atomic context.
354 * @completer_cookie: opaque context argument for @completer.
355 *
356 * This function causes a cluster switch on the given CPU by waking up
357 * the appropriate switcher thread. This function may or may not return
358 * before the switch has occurred.
359 *
360 * If a @completer callback function is supplied, it will be called when
361 * the switch is complete. This can be used to determine asynchronously
362 * when the switch is complete, regardless of when bL_switch_request()
363 * returns. When @completer is supplied, no new switch request is permitted
364 * for the affected CPU until after the switch is complete, and @completer
365 * has returned.
366 */
367int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
368 bL_switch_completion_handler completer,
369 void *completer_cookie)
370{
371 struct bL_thread *t;
372
373 if (cpu >= ARRAY_SIZE(bL_threads)) {
374 pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
375 return -EINVAL;
376 }
377
378 t = &bL_threads[cpu];
379
380 if (IS_ERR(t->task))
381 return PTR_ERR(t->task);
382 if (!t->task)
383 return -ESRCH;
384
385 spin_lock(&t->lock);
386 if (t->completer) {
387 spin_unlock(&t->lock);
388 return -EBUSY;
389 }
390 t->completer = completer;
391 t->completer_cookie = completer_cookie;
392 t->wanted_cluster = new_cluster_id;
393 spin_unlock(&t->lock);
394 wake_up(&t->wq);
395 return 0;
396}
397EXPORT_SYMBOL_GPL(bL_switch_request_cb);
398
399/*
400 * Activation and configuration code.
401 */
402
403static DEFINE_MUTEX(bL_switcher_activation_lock);
404static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
405static unsigned int bL_switcher_active;
406static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
407static cpumask_t bL_switcher_removed_logical_cpus;
408
409int bL_switcher_register_notifier(struct notifier_block *nb)
410{
411 return blocking_notifier_chain_register(&bL_activation_notifier, nb);
412}
413EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
414
415int bL_switcher_unregister_notifier(struct notifier_block *nb)
416{
417 return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
418}
419EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
420
421static int bL_activation_notify(unsigned long val)
422{
423 int ret;
424
425 ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
426 if (ret & NOTIFY_STOP_MASK)
427 pr_err("%s: notifier chain failed with status 0x%x\n",
428 __func__, ret);
429 return notifier_to_errno(ret);
430}
431
432static void bL_switcher_restore_cpus(void)
433{
434 int i;
435
436 for_each_cpu(i, &bL_switcher_removed_logical_cpus)
437 cpu_up(i);
438}
439
440static int bL_switcher_halve_cpus(void)
441{
442 int i, j, cluster_0, gic_id, ret;
443 unsigned int cpu, cluster, mask;
444 cpumask_t available_cpus;
445
446 /* First pass to validate what we have */
447 mask = 0;
448 for_each_online_cpu(i) {
449 cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
450 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
451 if (cluster >= 2) {
452 pr_err("%s: only dual cluster systems are supported\n", __func__);
453 return -EINVAL;
454 }
455 if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
456 return -EINVAL;
457 mask |= (1 << cluster);
458 }
459 if (mask != 3) {
460 pr_err("%s: no CPU pairing possible\n", __func__);
461 return -EINVAL;
462 }
463
464 /*
465 * Now let's do the pairing. We match each CPU with another CPU
466 * from a different cluster. To get a uniform scheduling behavior
467 * without fiddling with CPU topology and compute capacity data,
468 * we'll use logical CPUs initially belonging to the same cluster.
469 */
470 memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
471 cpumask_copy(&available_cpus, cpu_online_mask);
472 cluster_0 = -1;
473 for_each_cpu(i, &available_cpus) {
474 int match = -1;
475 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
476 if (cluster_0 == -1)
477 cluster_0 = cluster;
478 if (cluster != cluster_0)
479 continue;
480 cpumask_clear_cpu(i, &available_cpus);
481 for_each_cpu(j, &available_cpus) {
482 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
483 /*
484 * Let's remember the last match to create "odd"
485 * pairings on purpose in order for other code not
486 * to assume any relation between physical and
487 * logical CPU numbers.
488 */
489 if (cluster != cluster_0)
490 match = j;
491 }
492 if (match != -1) {
493 bL_switcher_cpu_pairing[i] = match;
494 cpumask_clear_cpu(match, &available_cpus);
495 pr_info("CPU%d paired with CPU%d\n", i, match);
496 }
497 }
498
499 /*
500 * Now we disable the unwanted CPUs i.e. everything that has no
501 * pairing information (that includes the pairing counterparts).
502 */
503 cpumask_clear(&bL_switcher_removed_logical_cpus);
504 for_each_online_cpu(i) {
505 cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
506 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
507
508 /* Let's take note of the GIC ID for this CPU */
509 gic_id = gic_get_cpu_id(i);
510 if (gic_id < 0) {
511 pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
512 bL_switcher_restore_cpus();
513 return -EINVAL;
514 }
515 bL_gic_id[cpu][cluster] = gic_id;
516 pr_info("GIC ID for CPU %u cluster %u is %u\n",
517 cpu, cluster, gic_id);
518
519 if (bL_switcher_cpu_pairing[i] != -1) {
520 bL_switcher_cpu_original_cluster[i] = cluster;
521 continue;
522 }
523
524 ret = cpu_down(i);
525 if (ret) {
526 bL_switcher_restore_cpus();
527 return ret;
528 }
529 cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
530 }
531
532 return 0;
533}
534
535/* Determine the logical CPU a given physical CPU is grouped on. */
536int bL_switcher_get_logical_index(u32 mpidr)
537{
538 int cpu;
539
540 if (!bL_switcher_active)
541 return -EUNATCH;
542
543 mpidr &= MPIDR_HWID_BITMASK;
544 for_each_online_cpu(cpu) {
545 int pairing = bL_switcher_cpu_pairing[cpu];
546 if (pairing == -1)
547 continue;
548 if ((mpidr == cpu_logical_map(cpu)) ||
549 (mpidr == cpu_logical_map(pairing)))
550 return cpu;
551 }
552 return -EINVAL;
553}
554
555static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
556{
557 trace_cpu_migrate_current(get_ns(), read_mpidr());
558}
559
560int bL_switcher_trace_trigger(void)
561{
562 int ret;
563
564 preempt_disable();
565
566 bL_switcher_trace_trigger_cpu(NULL);
567 ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
568
569 preempt_enable();
570
571 return ret;
572}
573EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
574
575static int bL_switcher_enable(void)
576{
577 int cpu, ret;
578
579 mutex_lock(&bL_switcher_activation_lock);
580 lock_device_hotplug();
581 if (bL_switcher_active) {
582 unlock_device_hotplug();
583 mutex_unlock(&bL_switcher_activation_lock);
584 return 0;
585 }
586
587 pr_info("big.LITTLE switcher initializing\n");
588
589 ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
590 if (ret)
591 goto error;
592
593 ret = bL_switcher_halve_cpus();
594 if (ret)
595 goto error;
596
597 bL_switcher_trace_trigger();
598
599 for_each_online_cpu(cpu) {
600 struct bL_thread *t = &bL_threads[cpu];
601 spin_lock_init(&t->lock);
602 init_waitqueue_head(&t->wq);
603 init_completion(&t->started);
604 t->wanted_cluster = -1;
605 t->task = bL_switcher_thread_create(cpu, t);
606 }
607
608 bL_switcher_active = 1;
609 bL_activation_notify(BL_NOTIFY_POST_ENABLE);
610 pr_info("big.LITTLE switcher initialized\n");
611 goto out;
612
613error:
614 pr_warn("big.LITTLE switcher initialization failed\n");
615 bL_activation_notify(BL_NOTIFY_POST_DISABLE);
616
617out:
618 unlock_device_hotplug();
619 mutex_unlock(&bL_switcher_activation_lock);
620 return ret;
621}
622
623#ifdef CONFIG_SYSFS
624
625static void bL_switcher_disable(void)
626{
627 unsigned int cpu, cluster;
628 struct bL_thread *t;
629 struct task_struct *task;
630
631 mutex_lock(&bL_switcher_activation_lock);
632 lock_device_hotplug();
633
634 if (!bL_switcher_active)
635 goto out;
636
637 if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
638 bL_activation_notify(BL_NOTIFY_POST_ENABLE);
639 goto out;
640 }
641
642 bL_switcher_active = 0;
643
644 /*
645 * To deactivate the switcher, we must shut down the switcher
646 * threads to prevent any other requests from being accepted.
647 * Then, if the final cluster for given logical CPU is not the
648 * same as the original one, we'll recreate a switcher thread
649 * just for the purpose of switching the CPU back without any
650 * possibility for interference from external requests.
651 */
652 for_each_online_cpu(cpu) {
653 t = &bL_threads[cpu];
654 task = t->task;
655 t->task = NULL;
656 if (!task || IS_ERR(task))
657 continue;
658 kthread_stop(task);
659 /* no more switch may happen on this CPU at this point */
660 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
661 if (cluster == bL_switcher_cpu_original_cluster[cpu])
662 continue;
663 init_completion(&t->started);
664 t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
665 task = bL_switcher_thread_create(cpu, t);
666 if (!IS_ERR(task)) {
667 wait_for_completion(&t->started);
668 kthread_stop(task);
669 cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
670 if (cluster == bL_switcher_cpu_original_cluster[cpu])
671 continue;
672 }
673 /* If execution gets here, we're in trouble. */
674 pr_crit("%s: unable to restore original cluster for CPU %d\n",
675 __func__, cpu);
676 pr_crit("%s: CPU %d can't be restored\n",
677 __func__, bL_switcher_cpu_pairing[cpu]);
678 cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
679 &bL_switcher_removed_logical_cpus);
680 }
681
682 bL_switcher_restore_cpus();
683 bL_switcher_trace_trigger();
684
685 bL_activation_notify(BL_NOTIFY_POST_DISABLE);
686
687out:
688 unlock_device_hotplug();
689 mutex_unlock(&bL_switcher_activation_lock);
690}
691
692static ssize_t bL_switcher_active_show(struct kobject *kobj,
693 struct kobj_attribute *attr, char *buf)
694{
695 return sprintf(buf, "%u\n", bL_switcher_active);
696}
697
698static ssize_t bL_switcher_active_store(struct kobject *kobj,
699 struct kobj_attribute *attr, const char *buf, size_t count)
700{
701 int ret;
702
703 switch (buf[0]) {
704 case '0':
705 bL_switcher_disable();
706 ret = 0;
707 break;
708 case '1':
709 ret = bL_switcher_enable();
710 break;
711 default:
712 ret = -EINVAL;
713 }
714
715 return (ret >= 0) ? count : ret;
716}
717
718static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
719 struct kobj_attribute *attr, const char *buf, size_t count)
720{
721 int ret = bL_switcher_trace_trigger();
722
723 return ret ? ret : count;
724}
725
726static struct kobj_attribute bL_switcher_active_attr =
727 __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
728
729static struct kobj_attribute bL_switcher_trace_trigger_attr =
730 __ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
731
732static struct attribute *bL_switcher_attrs[] = {
733 &bL_switcher_active_attr.attr,
734 &bL_switcher_trace_trigger_attr.attr,
735 NULL,
736};
737
738static struct attribute_group bL_switcher_attr_group = {
739 .attrs = bL_switcher_attrs,
740};
741
742static struct kobject *bL_switcher_kobj;
743
744static int __init bL_switcher_sysfs_init(void)
745{
746 int ret;
747
748 bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
749 if (!bL_switcher_kobj)
750 return -ENOMEM;
751 ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
752 if (ret)
753 kobject_put(bL_switcher_kobj);
754 return ret;
755}
756
757#endif /* CONFIG_SYSFS */
758
759bool bL_switcher_get_enabled(void)
760{
761 mutex_lock(&bL_switcher_activation_lock);
762
763 return bL_switcher_active;
764}
765EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
766
767void bL_switcher_put_enabled(void)
768{
769 mutex_unlock(&bL_switcher_activation_lock);
770}
771EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
772
773/*
774 * Veto any CPU hotplug operation on those CPUs we've removed
775 * while the switcher is active.
776 * We're just not ready to deal with that given the trickery involved.
777 */
778static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
779 unsigned long action, void *hcpu)
780{
781 if (bL_switcher_active) {
782 int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
783 switch (action & 0xf) {
784 case CPU_UP_PREPARE:
785 case CPU_DOWN_PREPARE:
786 if (pairing == -1)
787 return NOTIFY_BAD;
788 }
789 }
790 return NOTIFY_DONE;
791}
792
793static bool no_bL_switcher;
794core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
795
796static int __init bL_switcher_init(void)
797{
798 int ret;
799
800 if (MAX_NR_CLUSTERS != 2) {
801 pr_err("%s: only dual cluster systems are supported\n", __func__);
802 return -EINVAL;
803 }
804
805 cpu_notifier(bL_switcher_hotplug_callback, 0);
806
807 if (!no_bL_switcher) {
808 ret = bL_switcher_enable();
809 if (ret)
810 return ret;
811 }
812
813#ifdef CONFIG_SYSFS
814 ret = bL_switcher_sysfs_init();
815 if (ret)
816 pr_err("%s: unable to create sysfs entry\n", __func__);
817#endif
818
819 return 0;
820}
821
822late_initcall(bL_switcher_init);
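Note on usage (illustrative, not part of the patch above): bL_switch_request_cb() is the asynchronous entry point for requesting a cluster switch from other kernel code. A minimal sketch of a caller that blocks until the switch has completed could look like the following; only bL_switch_request_cb() itself comes from bL_switcher.c, the wrapper and all example_* names are hypothetical.

    #include <linux/completion.h>
    #include <asm/bL_switcher.h>

    /* Hypothetical completer: called in non-atomic context once the switch is done. */
    static void example_switch_done(void *cookie)
    {
            complete(cookie);
    }

    /* Ask logical CPU 0 to move to cluster 1 and wait for the switch to finish. */
    static int example_switch_cpu0_to_cluster1(void)
    {
            DECLARE_COMPLETION_ONSTACK(done);
            int ret;

            ret = bL_switch_request_cb(0, 1, example_switch_done, &done);
            if (ret)
                    return ret;     /* -EINVAL, -ESRCH or -EBUSY per the checks above */

            wait_for_completion(&done);
            return 0;
    }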
diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c
new file mode 100644
index 000000000000..3f47f1203c6b
--- /dev/null
+++ b/arch/arm/common/bL_switcher_dummy_if.c
@@ -0,0 +1,71 @@
1/*
2 * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
3 *
4 * Created by: Nicolas Pitre, November 2012
5 * Copyright: (C) 2012-2013 Linaro Limited
6 *
7 * Dummy interface to user space for debugging purpose only.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/init.h>
15#include <linux/module.h>
16#include <linux/fs.h>
17#include <linux/miscdevice.h>
18#include <asm/uaccess.h>
19#include <asm/bL_switcher.h>
20
21static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
22 size_t len, loff_t *pos)
23{
24 unsigned char val[3];
25 unsigned int cpu, cluster;
26 int ret;
27
28 pr_debug("%s\n", __func__);
29
30 if (len < 3)
31 return -EINVAL;
32
33 if (copy_from_user(val, buf, 3))
34 return -EFAULT;
35
36 /* format: <cpu#>,<cluster#> */
37 if (val[0] < '0' || val[0] > '9' ||
38 val[1] != ',' ||
39 val[2] < '0' || val[2] > '1')
40 return -EINVAL;
41
42 cpu = val[0] - '0';
43 cluster = val[2] - '0';
44 ret = bL_switch_request(cpu, cluster);
45
46 return ret ? : len;
47}
48
49static const struct file_operations bL_switcher_fops = {
50 .write = bL_switcher_write,
51 .owner = THIS_MODULE,
52};
53
54static struct miscdevice bL_switcher_device = {
55 MISC_DYNAMIC_MINOR,
56 "b.L_switcher",
57 &bL_switcher_fops
58};
59
60static int __init bL_switcher_dummy_if_init(void)
61{
62 return misc_register(&bL_switcher_device);
63}
64
65static void __exit bL_switcher_dummy_if_exit(void)
66{
67 misc_deregister(&bL_switcher_device);
68}
69
70module_init(bL_switcher_dummy_if_init);
71module_exit(bL_switcher_dummy_if_exit);
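Note on usage (illustrative, not part of the patch above): the dummy interface registers a misc device named "b.L_switcher" and parses writes of the form "<cpu#>,<cluster#>". Assuming the node is exposed as /dev/b.L_switcher (the path is not fixed by the patch), a minimal userspace test program could be:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dev/b.L_switcher", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* ask logical CPU 0 to switch to cluster 1 */
            if (write(fd, "0,1", 3) != 3)
                    perror("write");
            close(fd);
            return 0;
    }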
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 990250965f2c..26020a03f659 100644
--- a/arch/arm/common/mcpm_entry.c
+++ b/arch/arm/common/mcpm_entry.c
@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 	sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
 
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+			 unsigned long poke_phys_addr, unsigned long poke_val)
+{
+	unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+	poke[0] = poke_phys_addr;
+	poke[1] = poke_val;
+	__cpuc_flush_dcache_area((void *)poke, 8);
+	outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
 static const struct mcpm_platform_ops *platform_ops;
 
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
@@ -90,6 +102,21 @@ void mcpm_cpu_power_down(void)
 	BUG();
 }
 
+int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster)
+{
+	int ret;
+
+	if (WARN_ON_ONCE(!platform_ops || !platform_ops->power_down_finish))
+		return -EUNATCH;
+
+	ret = platform_ops->power_down_finish(cpu, cluster);
+	if (ret)
+		pr_warn("%s: cpu %u, cluster %u failed to power down (%d)\n",
+			__func__, cpu, cluster, ret);
+
+	return ret;
+}
+
 void mcpm_cpu_suspend(u64 expected_residency)
 {
 	phys_reset_t phys_reset;
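Note on usage (illustrative, not part of the patch above): mcpm_cpu_power_down_finish() only does something useful if the platform backend supplies the new power_down_finish method in its mcpm_platform_ops (the vexpress dcscb and TC2 backends touched elsewhere in this series do exactly that). A skeletal, hypothetical backend hook might look like the following; the other callbacks are omitted for brevity and all example_* names are made up.

    /*
     * Hypothetical backend method: confirm that (cpu, cluster) has really
     * reached the powered-down state, e.g. by polling a power controller
     * status register. Returning 0 reports success; a negative errno is
     * propagated (and logged) by mcpm_cpu_power_down_finish().
     */
    static int example_power_down_finish(unsigned int cpu, unsigned int cluster)
    {
            return 0;
    }

    static const struct mcpm_platform_ops example_ops = {
            /* .power_up, .power_down, etc. omitted in this sketch */
            .power_down_finish      = example_power_down_finish,
    };

    /* registered once at platform init time: mcpm_platform_register(&example_ops); */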
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 39c96df3477a..e02db4b81a66 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -15,6 +15,7 @@
 
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
+#include <asm/assembler.h>
 
 #include "vlock.h"
 
@@ -47,6 +48,7 @@
 
 ENTRY(mcpm_entry_point)
 
+ ARM_BE8(setend	be)
 THUMB(	adr	r12, BSYM(1f)	)
 THUMB(	bx	r12		)
 THUMB(	.thumb			)
@@ -71,12 +73,19 @@ ENTRY(mcpm_entry_point)
 	 * position independent way.
 	 */
 	adr	r5, 3f
-	ldmia	r5, {r6, r7, r8, r11}
+	ldmia	r5, {r0, r6, r7, r8, r11}
+	add	r0, r5, r0		@ r0 = mcpm_entry_early_pokes
 	add	r6, r5, r6		@ r6 = mcpm_entry_vectors
 	ldr	r7, [r5, r7]		@ r7 = mcpm_power_up_setup_phys
 	add	r8, r5, r8		@ r8 = mcpm_sync
 	add	r11, r5, r11		@ r11 = first_man_locks
 
+	@ Perform an early poke, if any
+	add	r0, r0, r4, lsl #3
+	ldmia	r0, {r0, r1}
+	teq	r0, #0
+	strne	r1, [r0]
+
 	mov	r0, #MCPM_SYNC_CLUSTER_SIZE
 	mla	r8, r0, r10, r8		@ r8 = sync cluster base
 
@@ -195,7 +204,8 @@ mcpm_entry_gated:
 
 	.align	2
 
-3:	.word	mcpm_entry_vectors - .
+3:	.word	mcpm_entry_early_pokes - .
+	.word	mcpm_entry_vectors - 3b
 	.word	mcpm_power_up_setup_phys - 3b
 	.word	mcpm_sync - 3b
 	.word	first_man_locks - 3b
@@ -214,6 +224,10 @@ first_man_locks:
 ENTRY(mcpm_entry_vectors)
 	.space	4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
 
+	.type	mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+	.space	8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
 	.type	mcpm_power_up_setup_phys, #object
 ENTRY(mcpm_power_up_setup_phys)
 	.space	4		@ set by mcpm_sync_init()
diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c
index 1bc34c7567fd..177251a4dd9a 100644
--- a/arch/arm/common/mcpm_platsmp.c
+++ b/arch/arm/common/mcpm_platsmp.c
@@ -19,14 +19,23 @@
 #include <asm/smp.h>
 #include <asm/smp_plat.h>
 
+static void cpu_to_pcpu(unsigned int cpu,
+			unsigned int *pcpu, unsigned int *pcluster)
+{
+	unsigned int mpidr;
+
+	mpidr = cpu_logical_map(cpu);
+	*pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	*pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+}
+
 static int mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-	unsigned int mpidr, pcpu, pcluster, ret;
+	unsigned int pcpu, pcluster, ret;
 	extern void secondary_startup(void);
 
-	mpidr = cpu_logical_map(cpu);
-	pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-	pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
 	pr_debug("%s: logical CPU %d is physical CPU %d cluster %d\n",
 		 __func__, cpu, pcpu, pcluster);
 
@@ -47,6 +56,15 @@ static void mcpm_secondary_init(unsigned int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+static int mcpm_cpu_kill(unsigned int cpu)
+{
+	unsigned int pcpu, pcluster;
+
+	cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
+	return !mcpm_cpu_power_down_finish(pcpu, pcluster);
+}
+
 static int mcpm_cpu_disable(unsigned int cpu)
 {
 	/*
@@ -73,6 +91,7 @@ static struct smp_operations __initdata mcpm_smp_ops = {
 	.smp_boot_secondary	= mcpm_boot_secondary,
 	.smp_secondary_init	= mcpm_secondary_init,
 #ifdef CONFIG_HOTPLUG_CPU
+	.cpu_kill		= mcpm_cpu_kill,
 	.cpu_disable		= mcpm_cpu_disable,
 	.cpu_die		= mcpm_cpu_die,
 #endif
diff --git a/arch/arm/common/timer-sp.c b/arch/arm/common/timer-sp.c
index e901d0f3e0bb..ce922d0ea7aa 100644
--- a/arch/arm/common/timer-sp.c
+++ b/arch/arm/common/timer-sp.c
@@ -175,7 +175,7 @@ static struct clock_event_device sp804_clockevent = {
 
 static struct irqaction sp804_timer_irq = {
 	.name		= "timer",
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
+	.flags		= IRQF_TIMER | IRQF_IRQPOLL,
 	.handler	= sp804_timer_interrupt,
 	.dev_id		= &sp804_clockevent,
 };
diff --git a/arch/arm/configs/h3600_defconfig b/arch/arm/configs/h3600_defconfig
index 317960f12488..0142ec37e0be 100644
--- a/arch/arm/configs/h3600_defconfig
+++ b/arch/arm/configs/h3600_defconfig
@@ -1,5 +1,6 @@
-CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_MODULES=y
@@ -11,11 +12,11 @@ CONFIG_ARCH_SA1100=y
 CONFIG_SA1100_H3600=y
 CONFIG_PCCARD=y
 CONFIG_PCMCIA_SA1100=y
+CONFIG_PREEMPT=y
 CONFIG_ZBOOT_ROM_TEXT=0x0
 CONFIG_ZBOOT_ROM_BSS=0x0
 # CONFIG_CPU_FREQ_STAT is not set
 CONFIG_FPE_NWFPE=y
-CONFIG_PM=y
 CONFIG_NET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
@@ -24,13 +25,10 @@ CONFIG_IRDA=m
 CONFIG_IRLAN=m
 CONFIG_IRNET=m
 CONFIG_IRCOMM=m
-CONFIG_SA1100_FIR=m
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
 CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -41,19 +39,15 @@ CONFIG_MTD_SA1100=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
-# CONFIG_MISC_DEVICES is not set
 CONFIG_IDE=y
 CONFIG_BLK_DEV_IDECS=y
 CONFIG_NETDEVICES=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
-# CONFIG_WLAN is not set
-CONFIG_NET_PCMCIA=y
 CONFIG_PCMCIA_PCNET=y
 CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_ASYNC=m
+# CONFIG_WLAN is not set
 # CONFIG_KEYBOARD_ATKBD is not set
 CONFIG_KEYBOARD_GPIO=y
 # CONFIG_INPUT_MOUSE is not set
@@ -64,8 +58,6 @@ CONFIG_SERIAL_SA1100_CONSOLE=y
 # CONFIG_HWMON is not set
 CONFIG_FB=y
 CONFIG_FB_SA1100=y
-# CONFIG_VGA_CONSOLE is not set
-# CONFIG_HID_SUPPORT is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EXT2_FS=y
 CONFIG_MSDOS_FS=m
@@ -74,6 +66,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=m
 CONFIG_NFS_FS=y
 CONFIG_NFSD=m
-CONFIG_SMB_FS=m
 CONFIG_NLS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
new file mode 100644
index 000000000000..6231d36b3635
--- /dev/null
+++ b/arch/arm/crypto/.gitignore
@@ -0,0 +1 @@
aesbs-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index a2c83851bc90..81cda39860c5 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -3,7 +3,17 @@
 #
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 
 aes-arm-y	:= aes-armv4.o aes_glue.o
-sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
+sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
index 59f7877ead6a..3003fa1f6fb4 100644
--- a/arch/arm/crypto/aes_glue.c
+++ b/arch/arm/crypto/aes_glue.c
@@ -6,22 +6,12 @@
 #include <linux/crypto.h>
 #include <crypto/aes.h>
 
-#define AES_MAXNR 14
+#include "aes_glue.h"
 
-typedef struct {
-	unsigned int rd_key[4 *(AES_MAXNR + 1)];
-	int rounds;
-} AES_KEY;
-
-struct AES_CTX {
-	AES_KEY enc_key;
-	AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
 
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
81 .cipher = { 71 .cipher = {
82 .cia_min_keysize = AES_MIN_KEY_SIZE, 72 .cia_min_keysize = AES_MIN_KEY_SIZE,
83 .cia_max_keysize = AES_MAX_KEY_SIZE, 73 .cia_max_keysize = AES_MAX_KEY_SIZE,
84 .cia_setkey = aes_set_key, 74 .cia_setkey = aes_set_key,
85 .cia_encrypt = aes_encrypt, 75 .cia_encrypt = aes_encrypt,
86 .cia_decrypt = aes_decrypt 76 .cia_decrypt = aes_decrypt
87 } 77 }
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
new file mode 100644
index 000000000000..cca3e51eb606
--- /dev/null
+++ b/arch/arm/crypto/aes_glue.h
@@ -0,0 +1,19 @@
1
2#define AES_MAXNR 14
3
4struct AES_KEY {
5 unsigned int rd_key[4 * (AES_MAXNR + 1)];
6 int rounds;
7};
8
9struct AES_CTX {
10 struct AES_KEY enc_key;
11 struct AES_KEY dec_key;
12};
13
14asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
15asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
16asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
17 const int bits, struct AES_KEY *key);
18asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
19 const int bits, struct AES_KEY *key);
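
The setkey path in aes_glue.c (only partially shown in the hunk above) consumes these declarations. Below is a minimal sketch of such a routine, assuming the crypto_tfm API of this kernel era; the name example_aes_set_key and the exact error handling are illustrative, not the patch's own code.

#include <linux/crypto.h>
#include <crypto/aes.h>
#include "aes_glue.h"

/* Illustrative sketch: expand one user key into both directions'
 * schedules using the helpers declared in aes_glue.h. */
static int example_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
			       unsigned int key_len)
{
	struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
	int bits;

	switch (key_len) {
	case AES_KEYSIZE_128: bits = 128; break;
	case AES_KEYSIZE_192: bits = 192; break;
	case AES_KEYSIZE_256: bits = 256; break;
	default:
		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
		return -EINVAL;
	}

	/* The assembler key expanders return nonzero on failure. */
	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc_key) ||
	    private_AES_set_decrypt_key(in_key, bits, &ctx->dec_key)) {
		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
		return -EINVAL;
	}
	return 0;
}
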
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped
new file mode 100644
index 000000000000..64205d453260
--- /dev/null
+++ b/arch/arm/crypto/aesbs-core.S_shipped
@@ -0,0 +1,2544 @@
1
2@ ====================================================================
3@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
4@ project. The module is, however, dual licensed under OpenSSL and
5@ CRYPTOGAMS licenses depending on where you obtain it. For further
6@ details see http://www.openssl.org/~appro/cryptogams/.
7@
8@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
9@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
10@ granted.
11@ ====================================================================
12
13@ Bit-sliced AES for ARM NEON
14@
15@ February 2012.
16@
17@ This implementation is direct adaptation of bsaes-x86_64 module for
18@ ARM NEON. Except that this module is endian-neutral [in sense that
19@ it can be compiled for either endianness] by courtesy of vld1.8's
20@ neutrality. Initial version doesn't implement interface to OpenSSL,
21@ only low-level primitives and unsupported entry points, just enough
22@ to collect performance results, which for Cortex-A8 core are:
23@
24@ encrypt 19.5 cycles per byte processed with 128-bit key
25@ decrypt 22.1 cycles per byte processed with 128-bit key
26@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
27@
28@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
29@ which is [much] worse than anticipated (for further details see
30@ http://www.openssl.org/~appro/Snapdragon-S4.html).
31@
32@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
33@ manages in 20.0 cycles].
34@
35@ When comparing to x86_64 results keep in mind that NEON unit is
36@ [mostly] single-issue and thus can't [fully] benefit from
37@ instruction-level parallelism. And when comparing to aes-armv4
38@ results keep in mind key schedule conversion overhead (see
39@ bsaes-x86_64.pl for further details)...
40@
41@ <appro@openssl.org>
42
43@ April-August 2013
44@
45@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
46@
47@ <ard.biesheuvel@linaro.org>
48
49#ifndef __KERNEL__
50# include "arm_arch.h"
51
52# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
53# define VFP_ABI_POP vldmia sp!,{d8-d15}
54# define VFP_ABI_FRAME 0x40
55#else
56# define VFP_ABI_PUSH
57# define VFP_ABI_POP
58# define VFP_ABI_FRAME 0
59# define BSAES_ASM_EXTENDED_KEY
60# define XTS_CHAIN_TWEAK
61# define __ARM_ARCH__ __LINUX_ARM_ARCH__
62#endif
63
64#ifdef __thumb__
65# define adrl adr
66#endif
67
68#if __ARM_ARCH__>=7
69.text
70.syntax unified @ ARMv7-capable assembler is expected to handle this
71#ifdef __thumb2__
72.thumb
73#else
74.code 32
75#endif
76
77.fpu neon
78
79.type _bsaes_decrypt8,%function
80.align 4
81_bsaes_decrypt8:
82 adr r6,_bsaes_decrypt8
83 vldmia r4!, {q9} @ round 0 key
84 add r6,r6,#.LM0ISR-_bsaes_decrypt8
85
86 vldmia r6!, {q8} @ .LM0ISR
87 veor q10, q0, q9 @ xor with round0 key
88 veor q11, q1, q9
89 vtbl.8 d0, {q10}, d16
90 vtbl.8 d1, {q10}, d17
91 veor q12, q2, q9
92 vtbl.8 d2, {q11}, d16
93 vtbl.8 d3, {q11}, d17
94 veor q13, q3, q9
95 vtbl.8 d4, {q12}, d16
96 vtbl.8 d5, {q12}, d17
97 veor q14, q4, q9
98 vtbl.8 d6, {q13}, d16
99 vtbl.8 d7, {q13}, d17
100 veor q15, q5, q9
101 vtbl.8 d8, {q14}, d16
102 vtbl.8 d9, {q14}, d17
103 veor q10, q6, q9
104 vtbl.8 d10, {q15}, d16
105 vtbl.8 d11, {q15}, d17
106 veor q11, q7, q9
107 vtbl.8 d12, {q10}, d16
108 vtbl.8 d13, {q10}, d17
109 vtbl.8 d14, {q11}, d16
110 vtbl.8 d15, {q11}, d17
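@ Bit-slice the eight loaded states: the interleaved shift / XOR / AND /
@ XOR ("swapmove") steps below, driven by the 0x55, 0x33 and 0x0f masks,
@ transpose the 8x8 bit matrix of every byte so that q0-q7 end up holding
@ one bit plane each across all 128 bytes.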
111 vmov.i8 q8,#0x55 @ compose .LBS0
112 vmov.i8 q9,#0x33 @ compose .LBS1
113 vshr.u64 q10, q6, #1
114 vshr.u64 q11, q4, #1
115 veor q10, q10, q7
116 veor q11, q11, q5
117 vand q10, q10, q8
118 vand q11, q11, q8
119 veor q7, q7, q10
120 vshl.u64 q10, q10, #1
121 veor q5, q5, q11
122 vshl.u64 q11, q11, #1
123 veor q6, q6, q10
124 veor q4, q4, q11
125 vshr.u64 q10, q2, #1
126 vshr.u64 q11, q0, #1
127 veor q10, q10, q3
128 veor q11, q11, q1
129 vand q10, q10, q8
130 vand q11, q11, q8
131 veor q3, q3, q10
132 vshl.u64 q10, q10, #1
133 veor q1, q1, q11
134 vshl.u64 q11, q11, #1
135 veor q2, q2, q10
136 veor q0, q0, q11
137 vmov.i8 q8,#0x0f @ compose .LBS2
138 vshr.u64 q10, q5, #2
139 vshr.u64 q11, q4, #2
140 veor q10, q10, q7
141 veor q11, q11, q6
142 vand q10, q10, q9
143 vand q11, q11, q9
144 veor q7, q7, q10
145 vshl.u64 q10, q10, #2
146 veor q6, q6, q11
147 vshl.u64 q11, q11, #2
148 veor q5, q5, q10
149 veor q4, q4, q11
150 vshr.u64 q10, q1, #2
151 vshr.u64 q11, q0, #2
152 veor q10, q10, q3
153 veor q11, q11, q2
154 vand q10, q10, q9
155 vand q11, q11, q9
156 veor q3, q3, q10
157 vshl.u64 q10, q10, #2
158 veor q2, q2, q11
159 vshl.u64 q11, q11, #2
160 veor q1, q1, q10
161 veor q0, q0, q11
162 vshr.u64 q10, q3, #4
163 vshr.u64 q11, q2, #4
164 veor q10, q10, q7
165 veor q11, q11, q6
166 vand q10, q10, q8
167 vand q11, q11, q8
168 veor q7, q7, q10
169 vshl.u64 q10, q10, #4
170 veor q6, q6, q11
171 vshl.u64 q11, q11, #4
172 veor q3, q3, q10
173 veor q2, q2, q11
174 vshr.u64 q10, q1, #4
175 vshr.u64 q11, q0, #4
176 veor q10, q10, q5
177 veor q11, q11, q4
178 vand q10, q10, q8
179 vand q11, q11, q8
180 veor q5, q5, q10
181 vshl.u64 q10, q10, #4
182 veor q4, q4, q11
183 vshl.u64 q11, q11, #4
184 veor q1, q1, q10
185 veor q0, q0, q11
186 sub r5,r5,#1
187 b .Ldec_sbox
188.align 4
189.Ldec_loop:
190 vldmia r4!, {q8-q11}
191 veor q8, q8, q0
192 veor q9, q9, q1
193 vtbl.8 d0, {q8}, d24
194 vtbl.8 d1, {q8}, d25
195 vldmia r4!, {q8}
196 veor q10, q10, q2
197 vtbl.8 d2, {q9}, d24
198 vtbl.8 d3, {q9}, d25
199 vldmia r4!, {q9}
200 veor q11, q11, q3
201 vtbl.8 d4, {q10}, d24
202 vtbl.8 d5, {q10}, d25
203 vldmia r4!, {q10}
204 vtbl.8 d6, {q11}, d24
205 vtbl.8 d7, {q11}, d25
206 vldmia r4!, {q11}
207 veor q8, q8, q4
208 veor q9, q9, q5
209 vtbl.8 d8, {q8}, d24
210 vtbl.8 d9, {q8}, d25
211 veor q10, q10, q6
212 vtbl.8 d10, {q9}, d24
213 vtbl.8 d11, {q9}, d25
214 veor q11, q11, q7
215 vtbl.8 d12, {q10}, d24
216 vtbl.8 d13, {q10}, d25
217 vtbl.8 d14, {q11}, d24
218 vtbl.8 d15, {q11}, d25
219.Ldec_sbox:
220 veor q1, q1, q4
221 veor q3, q3, q4
222
223 veor q4, q4, q7
224 veor q1, q1, q6
225 veor q2, q2, q7
226 veor q6, q6, q4
227
228 veor q0, q0, q1
229 veor q2, q2, q5
230 veor q7, q7, q6
231 veor q3, q3, q0
232 veor q5, q5, q0
233 veor q1, q1, q3
234 veor q11, q3, q0
235 veor q10, q7, q4
236 veor q9, q1, q6
237 veor q13, q4, q0
238 vmov q8, q10
239 veor q12, q5, q2
240
241 vorr q10, q10, q9
242 veor q15, q11, q8
243 vand q14, q11, q12
244 vorr q11, q11, q12
245 veor q12, q12, q9
246 vand q8, q8, q9
247 veor q9, q6, q2
248 vand q15, q15, q12
249 vand q13, q13, q9
250 veor q9, q3, q7
251 veor q12, q1, q5
252 veor q11, q11, q13
253 veor q10, q10, q13
254 vand q13, q9, q12
255 vorr q9, q9, q12
256 veor q11, q11, q15
257 veor q8, q8, q13
258 veor q10, q10, q14
259 veor q9, q9, q15
260 veor q8, q8, q14
261 vand q12, q4, q6
262 veor q9, q9, q14
263 vand q13, q0, q2
264 vand q14, q7, q1
265 vorr q15, q3, q5
266 veor q11, q11, q12
267 veor q9, q9, q14
268 veor q8, q8, q15
269 veor q10, q10, q13
270
271 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
272
273 @ new smaller inversion
274
275 vand q14, q11, q9
276 vmov q12, q8
277
278 veor q13, q10, q14
279 veor q15, q8, q14
280 veor q14, q8, q14 @ q14=q15
281
282 vbsl q13, q9, q8
283 vbsl q15, q11, q10
284 veor q11, q11, q10
285
286 vbsl q12, q13, q14
287 vbsl q8, q14, q13
288
289 vand q14, q12, q15
290 veor q9, q9, q8
291
292 veor q14, q14, q11
293 veor q12, q5, q2
294 veor q8, q1, q6
295 veor q10, q15, q14
296 vand q10, q10, q5
297 veor q5, q5, q1
298 vand q11, q1, q15
299 vand q5, q5, q14
300 veor q1, q11, q10
301 veor q5, q5, q11
302 veor q15, q15, q13
303 veor q14, q14, q9
304 veor q11, q15, q14
305 veor q10, q13, q9
306 vand q11, q11, q12
307 vand q10, q10, q2
308 veor q12, q12, q8
309 veor q2, q2, q6
310 vand q8, q8, q15
311 vand q6, q6, q13
312 vand q12, q12, q14
313 vand q2, q2, q9
314 veor q8, q8, q12
315 veor q2, q2, q6
316 veor q12, q12, q11
317 veor q6, q6, q10
318 veor q5, q5, q12
319 veor q2, q2, q12
320 veor q1, q1, q8
321 veor q6, q6, q8
322
323 veor q12, q3, q0
324 veor q8, q7, q4
325 veor q11, q15, q14
326 veor q10, q13, q9
327 vand q11, q11, q12
328 vand q10, q10, q0
329 veor q12, q12, q8
330 veor q0, q0, q4
331 vand q8, q8, q15
332 vand q4, q4, q13
333 vand q12, q12, q14
334 vand q0, q0, q9
335 veor q8, q8, q12
336 veor q0, q0, q4
337 veor q12, q12, q11
338 veor q4, q4, q10
339 veor q15, q15, q13
340 veor q14, q14, q9
341 veor q10, q15, q14
342 vand q10, q10, q3
343 veor q3, q3, q7
344 vand q11, q7, q15
345 vand q3, q3, q14
346 veor q7, q11, q10
347 veor q3, q3, q11
348 veor q3, q3, q12
349 veor q0, q0, q12
350 veor q7, q7, q8
351 veor q4, q4, q8
352 veor q1, q1, q7
353 veor q6, q6, q5
354
355 veor q4, q4, q1
356 veor q2, q2, q7
357 veor q5, q5, q7
358 veor q4, q4, q2
359 veor q7, q7, q0
360 veor q4, q4, q5
361 veor q3, q3, q6
362 veor q6, q6, q1
363 veor q3, q3, q4
364
365 veor q4, q4, q0
366 veor q7, q7, q3
367 subs r5,r5,#1
368 bcc .Ldec_done
369 @ multiplication by 0x05-0x00-0x04-0x00
370 vext.8 q8, q0, q0, #8
371 vext.8 q14, q3, q3, #8
372 vext.8 q15, q5, q5, #8
373 veor q8, q8, q0
374 vext.8 q9, q1, q1, #8
375 veor q14, q14, q3
376 vext.8 q10, q6, q6, #8
377 veor q15, q15, q5
378 vext.8 q11, q4, q4, #8
379 veor q9, q9, q1
380 vext.8 q12, q2, q2, #8
381 veor q10, q10, q6
382 vext.8 q13, q7, q7, #8
383 veor q11, q11, q4
384 veor q12, q12, q2
385 veor q13, q13, q7
386
387 veor q0, q0, q14
388 veor q1, q1, q14
389 veor q6, q6, q8
390 veor q2, q2, q10
391 veor q4, q4, q9
392 veor q1, q1, q15
393 veor q6, q6, q15
394 veor q2, q2, q14
395 veor q7, q7, q11
396 veor q4, q4, q14
397 veor q3, q3, q12
398 veor q2, q2, q15
399 veor q7, q7, q15
400 veor q5, q5, q13
401 vext.8 q8, q0, q0, #12 @ x0 <<< 32
402 vext.8 q9, q1, q1, #12
403 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
404 vext.8 q10, q6, q6, #12
405 veor q1, q1, q9
406 vext.8 q11, q4, q4, #12
407 veor q6, q6, q10
408 vext.8 q12, q2, q2, #12
409 veor q4, q4, q11
410 vext.8 q13, q7, q7, #12
411 veor q2, q2, q12
412 vext.8 q14, q3, q3, #12
413 veor q7, q7, q13
414 vext.8 q15, q5, q5, #12
415 veor q3, q3, q14
416
417 veor q9, q9, q0
418 veor q5, q5, q15
419 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
420 veor q10, q10, q1
421 veor q8, q8, q5
422 veor q9, q9, q5
423 vext.8 q1, q1, q1, #8
424 veor q13, q13, q2
425 veor q0, q0, q8
426 veor q14, q14, q7
427 veor q1, q1, q9
428 vext.8 q8, q2, q2, #8
429 veor q12, q12, q4
430 vext.8 q9, q7, q7, #8
431 veor q15, q15, q3
432 vext.8 q2, q4, q4, #8
433 veor q11, q11, q6
434 vext.8 q7, q5, q5, #8
435 veor q12, q12, q5
436 vext.8 q4, q3, q3, #8
437 veor q11, q11, q5
438 vext.8 q3, q6, q6, #8
439 veor q5, q9, q13
440 veor q11, q11, q2
441 veor q7, q7, q15
442 veor q6, q4, q14
443 veor q4, q8, q12
444 veor q2, q3, q10
445 vmov q3, q11
446 @ vmov q5, q9
447 vldmia r6, {q12} @ .LISR
448 ite eq @ Thumb2 thing, sanity check in ARM
449 addeq r6,r6,#0x10
450 bne .Ldec_loop
451 vldmia r6, {q12} @ .LISRM0
452 b .Ldec_loop
453.align 4
454.Ldec_done:
455 vmov.i8 q8,#0x55 @ compose .LBS0
456 vmov.i8 q9,#0x33 @ compose .LBS1
457 vshr.u64 q10, q3, #1
458 vshr.u64 q11, q2, #1
459 veor q10, q10, q5
460 veor q11, q11, q7
461 vand q10, q10, q8
462 vand q11, q11, q8
463 veor q5, q5, q10
464 vshl.u64 q10, q10, #1
465 veor q7, q7, q11
466 vshl.u64 q11, q11, #1
467 veor q3, q3, q10
468 veor q2, q2, q11
469 vshr.u64 q10, q6, #1
470 vshr.u64 q11, q0, #1
471 veor q10, q10, q4
472 veor q11, q11, q1
473 vand q10, q10, q8
474 vand q11, q11, q8
475 veor q4, q4, q10
476 vshl.u64 q10, q10, #1
477 veor q1, q1, q11
478 vshl.u64 q11, q11, #1
479 veor q6, q6, q10
480 veor q0, q0, q11
481 vmov.i8 q8,#0x0f @ compose .LBS2
482 vshr.u64 q10, q7, #2
483 vshr.u64 q11, q2, #2
484 veor q10, q10, q5
485 veor q11, q11, q3
486 vand q10, q10, q9
487 vand q11, q11, q9
488 veor q5, q5, q10
489 vshl.u64 q10, q10, #2
490 veor q3, q3, q11
491 vshl.u64 q11, q11, #2
492 veor q7, q7, q10
493 veor q2, q2, q11
494 vshr.u64 q10, q1, #2
495 vshr.u64 q11, q0, #2
496 veor q10, q10, q4
497 veor q11, q11, q6
498 vand q10, q10, q9
499 vand q11, q11, q9
500 veor q4, q4, q10
501 vshl.u64 q10, q10, #2
502 veor q6, q6, q11
503 vshl.u64 q11, q11, #2
504 veor q1, q1, q10
505 veor q0, q0, q11
506 vshr.u64 q10, q4, #4
507 vshr.u64 q11, q6, #4
508 veor q10, q10, q5
509 veor q11, q11, q3
510 vand q10, q10, q8
511 vand q11, q11, q8
512 veor q5, q5, q10
513 vshl.u64 q10, q10, #4
514 veor q3, q3, q11
515 vshl.u64 q11, q11, #4
516 veor q4, q4, q10
517 veor q6, q6, q11
518 vshr.u64 q10, q1, #4
519 vshr.u64 q11, q0, #4
520 veor q10, q10, q7
521 veor q11, q11, q2
522 vand q10, q10, q8
523 vand q11, q11, q8
524 veor q7, q7, q10
525 vshl.u64 q10, q10, #4
526 veor q2, q2, q11
527 vshl.u64 q11, q11, #4
528 veor q1, q1, q10
529 veor q0, q0, q11
530 vldmia r4, {q8} @ last round key
531 veor q6, q6, q8
532 veor q4, q4, q8
533 veor q2, q2, q8
534 veor q7, q7, q8
535 veor q3, q3, q8
536 veor q5, q5, q8
537 veor q0, q0, q8
538 veor q1, q1, q8
539 bx lr
540.size _bsaes_decrypt8,.-_bsaes_decrypt8
541
542.type _bsaes_const,%object
543.align 6
544_bsaes_const:
545.LM0ISR: @ InvShiftRows constants
546 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
547.LISR:
548 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
549.LISRM0:
550 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
551.LM0SR: @ ShiftRows constants
552 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
553.LSR:
554 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
555.LSRM0:
556 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
557.LM0:
558 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
559.LREVM0SR:
560 .quad 0x090d01050c000408, 0x03070b0f060a0e02
561.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
562.align 6
563.size _bsaes_const,.-_bsaes_const
564
565.type _bsaes_encrypt8,%function
566.align 4
567_bsaes_encrypt8:
568 adr r6,_bsaes_encrypt8
569 vldmia r4!, {q9} @ round 0 key
570 sub r6,r6,#_bsaes_encrypt8-.LM0SR
571
572 vldmia r6!, {q8} @ .LM0SR
573_bsaes_encrypt8_alt:
574 veor q10, q0, q9 @ xor with round0 key
575 veor q11, q1, q9
576 vtbl.8 d0, {q10}, d16
577 vtbl.8 d1, {q10}, d17
578 veor q12, q2, q9
579 vtbl.8 d2, {q11}, d16
580 vtbl.8 d3, {q11}, d17
581 veor q13, q3, q9
582 vtbl.8 d4, {q12}, d16
583 vtbl.8 d5, {q12}, d17
584 veor q14, q4, q9
585 vtbl.8 d6, {q13}, d16
586 vtbl.8 d7, {q13}, d17
587 veor q15, q5, q9
588 vtbl.8 d8, {q14}, d16
589 vtbl.8 d9, {q14}, d17
590 veor q10, q6, q9
591 vtbl.8 d10, {q15}, d16
592 vtbl.8 d11, {q15}, d17
593 veor q11, q7, q9
594 vtbl.8 d12, {q10}, d16
595 vtbl.8 d13, {q10}, d17
596 vtbl.8 d14, {q11}, d16
597 vtbl.8 d15, {q11}, d17
598_bsaes_encrypt8_bitslice:
599 vmov.i8 q8,#0x55 @ compose .LBS0
600 vmov.i8 q9,#0x33 @ compose .LBS1
601 vshr.u64 q10, q6, #1
602 vshr.u64 q11, q4, #1
603 veor q10, q10, q7
604 veor q11, q11, q5
605 vand q10, q10, q8
606 vand q11, q11, q8
607 veor q7, q7, q10
608 vshl.u64 q10, q10, #1
609 veor q5, q5, q11
610 vshl.u64 q11, q11, #1
611 veor q6, q6, q10
612 veor q4, q4, q11
613 vshr.u64 q10, q2, #1
614 vshr.u64 q11, q0, #1
615 veor q10, q10, q3
616 veor q11, q11, q1
617 vand q10, q10, q8
618 vand q11, q11, q8
619 veor q3, q3, q10
620 vshl.u64 q10, q10, #1
621 veor q1, q1, q11
622 vshl.u64 q11, q11, #1
623 veor q2, q2, q10
624 veor q0, q0, q11
625 vmov.i8 q8,#0x0f @ compose .LBS2
626 vshr.u64 q10, q5, #2
627 vshr.u64 q11, q4, #2
628 veor q10, q10, q7
629 veor q11, q11, q6
630 vand q10, q10, q9
631 vand q11, q11, q9
632 veor q7, q7, q10
633 vshl.u64 q10, q10, #2
634 veor q6, q6, q11
635 vshl.u64 q11, q11, #2
636 veor q5, q5, q10
637 veor q4, q4, q11
638 vshr.u64 q10, q1, #2
639 vshr.u64 q11, q0, #2
640 veor q10, q10, q3
641 veor q11, q11, q2
642 vand q10, q10, q9
643 vand q11, q11, q9
644 veor q3, q3, q10
645 vshl.u64 q10, q10, #2
646 veor q2, q2, q11
647 vshl.u64 q11, q11, #2
648 veor q1, q1, q10
649 veor q0, q0, q11
650 vshr.u64 q10, q3, #4
651 vshr.u64 q11, q2, #4
652 veor q10, q10, q7
653 veor q11, q11, q6
654 vand q10, q10, q8
655 vand q11, q11, q8
656 veor q7, q7, q10
657 vshl.u64 q10, q10, #4
658 veor q6, q6, q11
659 vshl.u64 q11, q11, #4
660 veor q3, q3, q10
661 veor q2, q2, q11
662 vshr.u64 q10, q1, #4
663 vshr.u64 q11, q0, #4
664 veor q10, q10, q5
665 veor q11, q11, q4
666 vand q10, q10, q8
667 vand q11, q11, q8
668 veor q5, q5, q10
669 vshl.u64 q10, q10, #4
670 veor q4, q4, q11
671 vshl.u64 q11, q11, #4
672 veor q1, q1, q10
673 veor q0, q0, q11
674 sub r5,r5,#1
675 b .Lenc_sbox
676.align 4
677.Lenc_loop:
678 vldmia r4!, {q8-q11}
679 veor q8, q8, q0
680 veor q9, q9, q1
681 vtbl.8 d0, {q8}, d24
682 vtbl.8 d1, {q8}, d25
683 vldmia r4!, {q8}
684 veor q10, q10, q2
685 vtbl.8 d2, {q9}, d24
686 vtbl.8 d3, {q9}, d25
687 vldmia r4!, {q9}
688 veor q11, q11, q3
689 vtbl.8 d4, {q10}, d24
690 vtbl.8 d5, {q10}, d25
691 vldmia r4!, {q10}
692 vtbl.8 d6, {q11}, d24
693 vtbl.8 d7, {q11}, d25
694 vldmia r4!, {q11}
695 veor q8, q8, q4
696 veor q9, q9, q5
697 vtbl.8 d8, {q8}, d24
698 vtbl.8 d9, {q8}, d25
699 veor q10, q10, q6
700 vtbl.8 d10, {q9}, d24
701 vtbl.8 d11, {q9}, d25
702 veor q11, q11, q7
703 vtbl.8 d12, {q10}, d24
704 vtbl.8 d13, {q10}, d25
705 vtbl.8 d14, {q11}, d24
706 vtbl.8 d15, {q11}, d25
707.Lenc_sbox:
708 veor q2, q2, q1
709 veor q5, q5, q6
710 veor q3, q3, q0
711 veor q6, q6, q2
712 veor q5, q5, q0
713
714 veor q6, q6, q3
715 veor q3, q3, q7
716 veor q7, q7, q5
717 veor q3, q3, q4
718 veor q4, q4, q5
719
720 veor q2, q2, q7
721 veor q3, q3, q1
722 veor q1, q1, q5
723 veor q11, q7, q4
724 veor q10, q1, q2
725 veor q9, q5, q3
726 veor q13, q2, q4
727 vmov q8, q10
728 veor q12, q6, q0
729
730 vorr q10, q10, q9
731 veor q15, q11, q8
732 vand q14, q11, q12
733 vorr q11, q11, q12
734 veor q12, q12, q9
735 vand q8, q8, q9
736 veor q9, q3, q0
737 vand q15, q15, q12
738 vand q13, q13, q9
739 veor q9, q7, q1
740 veor q12, q5, q6
741 veor q11, q11, q13
742 veor q10, q10, q13
743 vand q13, q9, q12
744 vorr q9, q9, q12
745 veor q11, q11, q15
746 veor q8, q8, q13
747 veor q10, q10, q14
748 veor q9, q9, q15
749 veor q8, q8, q14
750 vand q12, q2, q3
751 veor q9, q9, q14
752 vand q13, q4, q0
753 vand q14, q1, q5
754 vorr q15, q7, q6
755 veor q11, q11, q12
756 veor q9, q9, q14
757 veor q8, q8, q15
758 veor q10, q10, q13
759
760 @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
761
762 @ new smaller inversion
763
764 vand q14, q11, q9
765 vmov q12, q8
766
767 veor q13, q10, q14
768 veor q15, q8, q14
769 veor q14, q8, q14 @ q14=q15
770
771 vbsl q13, q9, q8
772 vbsl q15, q11, q10
773 veor q11, q11, q10
774
775 vbsl q12, q13, q14
776 vbsl q8, q14, q13
777
778 vand q14, q12, q15
779 veor q9, q9, q8
780
781 veor q14, q14, q11
782 veor q12, q6, q0
783 veor q8, q5, q3
784 veor q10, q15, q14
785 vand q10, q10, q6
786 veor q6, q6, q5
787 vand q11, q5, q15
788 vand q6, q6, q14
789 veor q5, q11, q10
790 veor q6, q6, q11
791 veor q15, q15, q13
792 veor q14, q14, q9
793 veor q11, q15, q14
794 veor q10, q13, q9
795 vand q11, q11, q12
796 vand q10, q10, q0
797 veor q12, q12, q8
798 veor q0, q0, q3
799 vand q8, q8, q15
800 vand q3, q3, q13
801 vand q12, q12, q14
802 vand q0, q0, q9
803 veor q8, q8, q12
804 veor q0, q0, q3
805 veor q12, q12, q11
806 veor q3, q3, q10
807 veor q6, q6, q12
808 veor q0, q0, q12
809 veor q5, q5, q8
810 veor q3, q3, q8
811
812 veor q12, q7, q4
813 veor q8, q1, q2
814 veor q11, q15, q14
815 veor q10, q13, q9
816 vand q11, q11, q12
817 vand q10, q10, q4
818 veor q12, q12, q8
819 veor q4, q4, q2
820 vand q8, q8, q15
821 vand q2, q2, q13
822 vand q12, q12, q14
823 vand q4, q4, q9
824 veor q8, q8, q12
825 veor q4, q4, q2
826 veor q12, q12, q11
827 veor q2, q2, q10
828 veor q15, q15, q13
829 veor q14, q14, q9
830 veor q10, q15, q14
831 vand q10, q10, q7
832 veor q7, q7, q1
833 vand q11, q1, q15
834 vand q7, q7, q14
835 veor q1, q11, q10
836 veor q7, q7, q11
837 veor q7, q7, q12
838 veor q4, q4, q12
839 veor q1, q1, q8
840 veor q2, q2, q8
841 veor q7, q7, q0
842 veor q1, q1, q6
843 veor q6, q6, q0
844 veor q4, q4, q7
845 veor q0, q0, q1
846
847 veor q1, q1, q5
848 veor q5, q5, q2
849 veor q2, q2, q3
850 veor q3, q3, q5
851 veor q4, q4, q5
852
853 veor q6, q6, q3
854 subs r5,r5,#1
855 bcc .Lenc_done
856 vext.8 q8, q0, q0, #12 @ x0 <<< 32
857 vext.8 q9, q1, q1, #12
858 veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
859 vext.8 q10, q4, q4, #12
860 veor q1, q1, q9
861 vext.8 q11, q6, q6, #12
862 veor q4, q4, q10
863 vext.8 q12, q3, q3, #12
864 veor q6, q6, q11
865 vext.8 q13, q7, q7, #12
866 veor q3, q3, q12
867 vext.8 q14, q2, q2, #12
868 veor q7, q7, q13
869 vext.8 q15, q5, q5, #12
870 veor q2, q2, q14
871
872 veor q9, q9, q0
873 veor q5, q5, q15
874 vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
875 veor q10, q10, q1
876 veor q8, q8, q5
877 veor q9, q9, q5
878 vext.8 q1, q1, q1, #8
879 veor q13, q13, q3
880 veor q0, q0, q8
881 veor q14, q14, q7
882 veor q1, q1, q9
883 vext.8 q8, q3, q3, #8
884 veor q12, q12, q6
885 vext.8 q9, q7, q7, #8
886 veor q15, q15, q2
887 vext.8 q3, q6, q6, #8
888 veor q11, q11, q4
889 vext.8 q7, q5, q5, #8
890 veor q12, q12, q5
891 vext.8 q6, q2, q2, #8
892 veor q11, q11, q5
893 vext.8 q2, q4, q4, #8
894 veor q5, q9, q13
895 veor q4, q8, q12
896 veor q3, q3, q11
897 veor q7, q7, q15
898 veor q6, q6, q14
899 @ vmov q4, q8
900 veor q2, q2, q10
901 @ vmov q5, q9
902 vldmia r6, {q12} @ .LSR
903 ite eq @ Thumb2 thing, sanity check in ARM
904 addeq r6,r6,#0x10
905 bne .Lenc_loop
906 vldmia r6, {q12} @ .LSRM0
907 b .Lenc_loop
908.align 4
909.Lenc_done:
910 vmov.i8 q8,#0x55 @ compose .LBS0
911 vmov.i8 q9,#0x33 @ compose .LBS1
912 vshr.u64 q10, q2, #1
913 vshr.u64 q11, q3, #1
914 veor q10, q10, q5
915 veor q11, q11, q7
916 vand q10, q10, q8
917 vand q11, q11, q8
918 veor q5, q5, q10
919 vshl.u64 q10, q10, #1
920 veor q7, q7, q11
921 vshl.u64 q11, q11, #1
922 veor q2, q2, q10
923 veor q3, q3, q11
924 vshr.u64 q10, q4, #1
925 vshr.u64 q11, q0, #1
926 veor q10, q10, q6
927 veor q11, q11, q1
928 vand q10, q10, q8
929 vand q11, q11, q8
930 veor q6, q6, q10
931 vshl.u64 q10, q10, #1
932 veor q1, q1, q11
933 vshl.u64 q11, q11, #1
934 veor q4, q4, q10
935 veor q0, q0, q11
936 vmov.i8 q8,#0x0f @ compose .LBS2
937 vshr.u64 q10, q7, #2
938 vshr.u64 q11, q3, #2
939 veor q10, q10, q5
940 veor q11, q11, q2
941 vand q10, q10, q9
942 vand q11, q11, q9
943 veor q5, q5, q10
944 vshl.u64 q10, q10, #2
945 veor q2, q2, q11
946 vshl.u64 q11, q11, #2
947 veor q7, q7, q10
948 veor q3, q3, q11
949 vshr.u64 q10, q1, #2
950 vshr.u64 q11, q0, #2
951 veor q10, q10, q6
952 veor q11, q11, q4
953 vand q10, q10, q9
954 vand q11, q11, q9
955 veor q6, q6, q10
956 vshl.u64 q10, q10, #2
957 veor q4, q4, q11
958 vshl.u64 q11, q11, #2
959 veor q1, q1, q10
960 veor q0, q0, q11
961 vshr.u64 q10, q6, #4
962 vshr.u64 q11, q4, #4
963 veor q10, q10, q5
964 veor q11, q11, q2
965 vand q10, q10, q8
966 vand q11, q11, q8
967 veor q5, q5, q10
968 vshl.u64 q10, q10, #4
969 veor q2, q2, q11
970 vshl.u64 q11, q11, #4
971 veor q6, q6, q10
972 veor q4, q4, q11
973 vshr.u64 q10, q1, #4
974 vshr.u64 q11, q0, #4
975 veor q10, q10, q7
976 veor q11, q11, q3
977 vand q10, q10, q8
978 vand q11, q11, q8
979 veor q7, q7, q10
980 vshl.u64 q10, q10, #4
981 veor q3, q3, q11
982 vshl.u64 q11, q11, #4
983 veor q1, q1, q10
984 veor q0, q0, q11
985 vldmia r4, {q8} @ last round key
986 veor q4, q4, q8
987 veor q6, q6, q8
988 veor q3, q3, q8
989 veor q7, q7, q8
990 veor q2, q2, q8
991 veor q5, q5, q8
992 veor q0, q0, q8
993 veor q1, q1, q8
994 bx lr
995.size _bsaes_encrypt8,.-_bsaes_encrypt8
996.type _bsaes_key_convert,%function
997.align 4
998_bsaes_key_convert:
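@ Convert the expanded AES key schedule into bit-sliced form: the round 0
@ key is stored as loaded; each subsequent round key is permuted through
@ .LM0 (vtbl) and vtst against the single-bit masks 0x01..0x80, spreading
@ every key byte across eight mask registers (128 bytes stored per round).
@ Bits 0, 1, 5 and 6 are inverted ("pnot"), matching the 0x63 constant
@ composed at the end.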
999 adr r6,_bsaes_key_convert
1000 vld1.8 {q7}, [r4]! @ load round 0 key
1001 sub r6,r6,#_bsaes_key_convert-.LM0
1002 vld1.8 {q15}, [r4]! @ load round 1 key
1003
1004 vmov.i8 q8, #0x01 @ bit masks
1005 vmov.i8 q9, #0x02
1006 vmov.i8 q10, #0x04
1007 vmov.i8 q11, #0x08
1008 vmov.i8 q12, #0x10
1009 vmov.i8 q13, #0x20
1010 vldmia r6, {q14} @ .LM0
1011
1012#ifdef __ARMEL__
1013 vrev32.8 q7, q7
1014 vrev32.8 q15, q15
1015#endif
1016 sub r5,r5,#1
1017 vstmia r12!, {q7} @ save round 0 key
1018 b .Lkey_loop
1019
1020.align 4
1021.Lkey_loop:
1022 vtbl.8 d14,{q15},d28
1023 vtbl.8 d15,{q15},d29
1024 vmov.i8 q6, #0x40
1025 vmov.i8 q15, #0x80
1026
1027 vtst.8 q0, q7, q8
1028 vtst.8 q1, q7, q9
1029 vtst.8 q2, q7, q10
1030 vtst.8 q3, q7, q11
1031 vtst.8 q4, q7, q12
1032 vtst.8 q5, q7, q13
1033 vtst.8 q6, q7, q6
1034 vtst.8 q7, q7, q15
1035 vld1.8 {q15}, [r4]! @ load next round key
1036 vmvn q0, q0 @ "pnot"
1037 vmvn q1, q1
1038 vmvn q5, q5
1039 vmvn q6, q6
1040#ifdef __ARMEL__
1041 vrev32.8 q15, q15
1042#endif
1043 subs r5,r5,#1
1044 vstmia r12!,{q0-q7} @ write bit-sliced round key
1045 bne .Lkey_loop
1046
1047 vmov.i8 q7,#0x63 @ compose .L63
1048 @ don't save last round key
1049 bx lr
1050.size _bsaes_key_convert,.-_bsaes_key_convert
1051.extern AES_cbc_encrypt
1052.extern AES_decrypt
1053
1054.global bsaes_cbc_encrypt
1055.type bsaes_cbc_encrypt,%function
1056.align 5
1057bsaes_cbc_encrypt:
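@ Arguments: r0 = input, r1 = output, r2 = length in bytes, r3 = AES key
@ schedule; the IV pointer is the first stack argument (loaded via ip
@ below). The caller must only use this for decryption (enc == 0, see
@ comment below).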
1058#ifndef __KERNEL__
1059 cmp r2, #128
1060#ifndef __thumb__
1061 blo AES_cbc_encrypt
1062#else
1063 bhs 1f
1064 b AES_cbc_encrypt
10651:
1066#endif
1067#endif
1068
1069 @ it is up to the caller to make sure we are called with enc == 0
1070
1071 mov ip, sp
1072 stmdb sp!, {r4-r10, lr}
1073 VFP_ABI_PUSH
1074 ldr r8, [ip] @ IV is 1st arg on the stack
1075 mov r2, r2, lsr#4 @ len in 16 byte blocks
1076 sub sp, #0x10 @ scratch space to carry over the IV
1077 mov r9, sp @ save sp
1078
1079 ldr r10, [r3, #240] @ get # of rounds
1080#ifndef BSAES_ASM_EXTENDED_KEY
1081 @ allocate the key schedule on the stack
1082 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1083 add r12, #96 @ size of bit-sliced key schedule
1084
1085 @ populate the key schedule
1086 mov r4, r3 @ pass key
1087 mov r5, r10 @ pass # of rounds
1088 mov sp, r12 @ sp is sp
1089 bl _bsaes_key_convert
1090 vldmia sp, {q6}
1091 vstmia r12, {q15} @ save last round key
1092 veor q7, q7, q6 @ fix up round 0 key
1093 vstmia sp, {q7}
1094#else
1095 ldr r12, [r3, #244]
1096 eors r12, #1
1097 beq 0f
1098
1099 @ populate the key schedule
1100 str r12, [r3, #244]
1101 mov r4, r3 @ pass key
1102 mov r5, r10 @ pass # of rounds
1103 add r12, r3, #248 @ pass key schedule
1104 bl _bsaes_key_convert
1105 add r4, r3, #248
1106 vldmia r4, {q6}
1107 vstmia r12, {q15} @ save last round key
1108 veor q7, q7, q6 @ fix up round 0 key
1109 vstmia r4, {q7}
1110
1111.align 2
11120:
1113#endif
1114
1115 vld1.8 {q15}, [r8] @ load IV
1116 b .Lcbc_dec_loop
1117
1118.align 4
1119.Lcbc_dec_loop:
1120 subs r2, r2, #0x8
1121 bmi .Lcbc_dec_loop_finish
1122
1123 vld1.8 {q0-q1}, [r0]! @ load input
1124 vld1.8 {q2-q3}, [r0]!
1125#ifndef BSAES_ASM_EXTENDED_KEY
1126 mov r4, sp @ pass the key
1127#else
1128 add r4, r3, #248
1129#endif
1130 vld1.8 {q4-q5}, [r0]!
1131 mov r5, r10
1132 vld1.8 {q6-q7}, [r0]
1133 sub r0, r0, #0x60
1134 vstmia r9, {q15} @ put aside IV
1135
1136 bl _bsaes_decrypt8
1137
1138 vldmia r9, {q14} @ reload IV
1139 vld1.8 {q8-q9}, [r0]! @ reload input
1140 veor q0, q0, q14 @ ^= IV
1141 vld1.8 {q10-q11}, [r0]!
1142 veor q1, q1, q8
1143 veor q6, q6, q9
1144 vld1.8 {q12-q13}, [r0]!
1145 veor q4, q4, q10
1146 veor q2, q2, q11
1147 vld1.8 {q14-q15}, [r0]!
1148 veor q7, q7, q12
1149 vst1.8 {q0-q1}, [r1]! @ write output
1150 veor q3, q3, q13
1151 vst1.8 {q6}, [r1]!
1152 veor q5, q5, q14
1153 vst1.8 {q4}, [r1]!
1154 vst1.8 {q2}, [r1]!
1155 vst1.8 {q7}, [r1]!
1156 vst1.8 {q3}, [r1]!
1157 vst1.8 {q5}, [r1]!
1158
1159 b .Lcbc_dec_loop
1160
1161.Lcbc_dec_loop_finish:
1162 adds r2, r2, #8
1163 beq .Lcbc_dec_done
1164
1165 vld1.8 {q0}, [r0]! @ load input
1166 cmp r2, #2
1167 blo .Lcbc_dec_one
1168 vld1.8 {q1}, [r0]!
1169#ifndef BSAES_ASM_EXTENDED_KEY
1170 mov r4, sp @ pass the key
1171#else
1172 add r4, r3, #248
1173#endif
1174 mov r5, r10
1175 vstmia r9, {q15} @ put aside IV
1176 beq .Lcbc_dec_two
1177 vld1.8 {q2}, [r0]!
1178 cmp r2, #4
1179 blo .Lcbc_dec_three
1180 vld1.8 {q3}, [r0]!
1181 beq .Lcbc_dec_four
1182 vld1.8 {q4}, [r0]!
1183 cmp r2, #6
1184 blo .Lcbc_dec_five
1185 vld1.8 {q5}, [r0]!
1186 beq .Lcbc_dec_six
1187 vld1.8 {q6}, [r0]!
1188 sub r0, r0, #0x70
1189
1190 bl _bsaes_decrypt8
1191
1192 vldmia r9, {q14} @ reload IV
1193 vld1.8 {q8-q9}, [r0]! @ reload input
1194 veor q0, q0, q14 @ ^= IV
1195 vld1.8 {q10-q11}, [r0]!
1196 veor q1, q1, q8
1197 veor q6, q6, q9
1198 vld1.8 {q12-q13}, [r0]!
1199 veor q4, q4, q10
1200 veor q2, q2, q11
1201 vld1.8 {q15}, [r0]!
1202 veor q7, q7, q12
1203 vst1.8 {q0-q1}, [r1]! @ write output
1204 veor q3, q3, q13
1205 vst1.8 {q6}, [r1]!
1206 vst1.8 {q4}, [r1]!
1207 vst1.8 {q2}, [r1]!
1208 vst1.8 {q7}, [r1]!
1209 vst1.8 {q3}, [r1]!
1210 b .Lcbc_dec_done
1211.align 4
1212.Lcbc_dec_six:
1213 sub r0, r0, #0x60
1214 bl _bsaes_decrypt8
1215 vldmia r9,{q14} @ reload IV
1216 vld1.8 {q8-q9}, [r0]! @ reload input
1217 veor q0, q0, q14 @ ^= IV
1218 vld1.8 {q10-q11}, [r0]!
1219 veor q1, q1, q8
1220 veor q6, q6, q9
1221 vld1.8 {q12}, [r0]!
1222 veor q4, q4, q10
1223 veor q2, q2, q11
1224 vld1.8 {q15}, [r0]!
1225 veor q7, q7, q12
1226 vst1.8 {q0-q1}, [r1]! @ write output
1227 vst1.8 {q6}, [r1]!
1228 vst1.8 {q4}, [r1]!
1229 vst1.8 {q2}, [r1]!
1230 vst1.8 {q7}, [r1]!
1231 b .Lcbc_dec_done
1232.align 4
1233.Lcbc_dec_five:
1234 sub r0, r0, #0x50
1235 bl _bsaes_decrypt8
1236 vldmia r9, {q14} @ reload IV
1237 vld1.8 {q8-q9}, [r0]! @ reload input
1238 veor q0, q0, q14 @ ^= IV
1239 vld1.8 {q10-q11}, [r0]!
1240 veor q1, q1, q8
1241 veor q6, q6, q9
1242 vld1.8 {q15}, [r0]!
1243 veor q4, q4, q10
1244 vst1.8 {q0-q1}, [r1]! @ write output
1245 veor q2, q2, q11
1246 vst1.8 {q6}, [r1]!
1247 vst1.8 {q4}, [r1]!
1248 vst1.8 {q2}, [r1]!
1249 b .Lcbc_dec_done
1250.align 4
1251.Lcbc_dec_four:
1252 sub r0, r0, #0x40
1253 bl _bsaes_decrypt8
1254 vldmia r9, {q14} @ reload IV
1255 vld1.8 {q8-q9}, [r0]! @ reload input
1256 veor q0, q0, q14 @ ^= IV
1257 vld1.8 {q10}, [r0]!
1258 veor q1, q1, q8
1259 veor q6, q6, q9
1260 vld1.8 {q15}, [r0]!
1261 veor q4, q4, q10
1262 vst1.8 {q0-q1}, [r1]! @ write output
1263 vst1.8 {q6}, [r1]!
1264 vst1.8 {q4}, [r1]!
1265 b .Lcbc_dec_done
1266.align 4
1267.Lcbc_dec_three:
1268 sub r0, r0, #0x30
1269 bl _bsaes_decrypt8
1270 vldmia r9, {q14} @ reload IV
1271 vld1.8 {q8-q9}, [r0]! @ reload input
1272 veor q0, q0, q14 @ ^= IV
1273 vld1.8 {q15}, [r0]!
1274 veor q1, q1, q8
1275 veor q6, q6, q9
1276 vst1.8 {q0-q1}, [r1]! @ write output
1277 vst1.8 {q6}, [r1]!
1278 b .Lcbc_dec_done
1279.align 4
1280.Lcbc_dec_two:
1281 sub r0, r0, #0x20
1282 bl _bsaes_decrypt8
1283 vldmia r9, {q14} @ reload IV
1284 vld1.8 {q8}, [r0]! @ reload input
1285 veor q0, q0, q14 @ ^= IV
1286 vld1.8 {q15}, [r0]! @ reload input
1287 veor q1, q1, q8
1288 vst1.8 {q0-q1}, [r1]! @ write output
1289 b .Lcbc_dec_done
1290.align 4
1291.Lcbc_dec_one:
1292 sub r0, r0, #0x10
1293 mov r10, r1 @ save original out pointer
1294 mov r1, r9 @ use the iv scratch space as out buffer
1295 mov r2, r3
1296 vmov q4,q15 @ just in case ensure that IV
1297 vmov q5,q0 @ and input are preserved
1298 bl AES_decrypt
1299 vld1.8 {q0}, [r9,:64] @ load result
1300 veor q0, q0, q4 @ ^= IV
1301 vmov q15, q5 @ q5 holds input
1302 vst1.8 {q0}, [r10] @ write output
1303
1304.Lcbc_dec_done:
1305#ifndef BSAES_ASM_EXTENDED_KEY
1306 vmov.i32 q0, #0
1307 vmov.i32 q1, #0
1308.Lcbc_dec_bzero: @ wipe key schedule [if any]
1309 vstmia sp!, {q0-q1}
1310 cmp sp, r9
1311 bne .Lcbc_dec_bzero
1312#endif
1313
1314 mov sp, r9
1315 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1316 vst1.8 {q15}, [r8] @ return IV
1317 VFP_ABI_POP
1318 ldmia sp!, {r4-r10, pc}
1319.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1320.extern AES_encrypt
1321.global bsaes_ctr32_encrypt_blocks
1322.type bsaes_ctr32_encrypt_blocks,%function
1323.align 5
1324bsaes_ctr32_encrypt_blocks:
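@ Arguments: r0 = input, r1 = output, r2 = number of 16-byte blocks,
@ r3 = AES key schedule; the counter block pointer is the first stack
@ argument. Only the last 32 bits of the counter are incremented
@ (big-endian), hence "ctr32"; fewer than 8 blocks are handled one at a
@ time via AES_encrypt in .Lctr_enc_short.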
1325 cmp r2, #8 @ use plain AES for
1326 blo .Lctr_enc_short @ small sizes
1327
1328 mov ip, sp
1329 stmdb sp!, {r4-r10, lr}
1330 VFP_ABI_PUSH
1331 ldr r8, [ip] @ ctr is 1st arg on the stack
1332 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1333 mov r9, sp @ save sp
1334
1335 ldr r10, [r3, #240] @ get # of rounds
1336#ifndef BSAES_ASM_EXTENDED_KEY
1337 @ allocate the key schedule on the stack
1338 sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
1339 add r12, #96 @ size of bit-sliced key schedule
1340
1341 @ populate the key schedule
1342 mov r4, r3 @ pass key
1343 mov r5, r10 @ pass # of rounds
1344 mov sp, r12 @ sp is sp
1345 bl _bsaes_key_convert
1346 veor q7,q7,q15 @ fix up last round key
1347 vstmia r12, {q7} @ save last round key
1348
1349 vld1.8 {q0}, [r8] @ load counter
1350 add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
1351 vldmia sp, {q4} @ load round0 key
1352#else
1353 ldr r12, [r3, #244]
1354 eors r12, #1
1355 beq 0f
1356
1357 @ populate the key schedule
1358 str r12, [r3, #244]
1359 mov r4, r3 @ pass key
1360 mov r5, r10 @ pass # of rounds
1361 add r12, r3, #248 @ pass key schedule
1362 bl _bsaes_key_convert
1363 veor q7,q7,q15 @ fix up last round key
1364 vstmia r12, {q7} @ save last round key
1365
1366.align 2
13670: add r12, r3, #248
1368 vld1.8 {q0}, [r8] @ load counter
1369 adrl r8, .LREVM0SR @ borrow r8
1370 vldmia r12, {q4} @ load round0 key
1371 sub sp, #0x10 @ place for adjusted round0 key
1372#endif
1373
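@ Build the constants 1<<96 and 2<<96 (a one/two in the most significant
@ 32-bit word only). vrev32.8 byte-swaps the counter per 32-bit word so
@ the big-endian counter in its last word can be advanced with plain
@ vadd.u32, wrapping within those 32 bits; the round 0 key gets the same
@ treatment to match the .LREVM0SR input permutation used below.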
1374 vmov.i32 q8,#1 @ compose 1<<96
1375 veor q9,q9,q9
1376 vrev32.8 q0,q0
1377 vext.8 q8,q9,q8,#4
1378 vrev32.8 q4,q4
1379 vadd.u32 q9,q8,q8 @ compose 2<<96
1380 vstmia sp, {q4} @ save adjusted round0 key
1381 b .Lctr_enc_loop
1382
1383.align 4
1384.Lctr_enc_loop:
1385 vadd.u32 q10, q8, q9 @ compose 3<<96
1386 vadd.u32 q1, q0, q8 @ +1
1387 vadd.u32 q2, q0, q9 @ +2
1388 vadd.u32 q3, q0, q10 @ +3
1389 vadd.u32 q4, q1, q10
1390 vadd.u32 q5, q2, q10
1391 vadd.u32 q6, q3, q10
1392 vadd.u32 q7, q4, q10
1393 vadd.u32 q10, q5, q10 @ next counter
1394
1395 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1396 @ to flip byte order in 32-bit counter
1397
1398 vldmia sp, {q9} @ load round0 key
1399#ifndef BSAES_ASM_EXTENDED_KEY
1400 add r4, sp, #0x10 @ pass next round key
1401#else
1402 add r4, r3, #264
1403#endif
1404 vldmia r8, {q8} @ .LREVM0SR
1405 mov r5, r10 @ pass rounds
1406 vstmia r9, {q10} @ save next counter
1407 sub r6, r8, #.LREVM0SR-.LSR @ pass constants
1408
1409 bl _bsaes_encrypt8_alt
1410
1411 subs r2, r2, #8
1412 blo .Lctr_enc_loop_done
1413
1414 vld1.8 {q8-q9}, [r0]! @ load input
1415 vld1.8 {q10-q11}, [r0]!
1416 veor q0, q8
1417 veor q1, q9
1418 vld1.8 {q12-q13}, [r0]!
1419 veor q4, q10
1420 veor q6, q11
1421 vld1.8 {q14-q15}, [r0]!
1422 veor q3, q12
1423 vst1.8 {q0-q1}, [r1]! @ write output
1424 veor q7, q13
1425 veor q2, q14
1426 vst1.8 {q4}, [r1]!
1427 veor q5, q15
1428 vst1.8 {q6}, [r1]!
1429 vmov.i32 q8, #1 @ compose 1<<96
1430 vst1.8 {q3}, [r1]!
1431 veor q9, q9, q9
1432 vst1.8 {q7}, [r1]!
1433 vext.8 q8, q9, q8, #4
1434 vst1.8 {q2}, [r1]!
1435 vadd.u32 q9,q8,q8 @ compose 2<<96
1436 vst1.8 {q5}, [r1]!
1437 vldmia r9, {q0} @ load counter
1438
1439 bne .Lctr_enc_loop
1440 b .Lctr_enc_done
1441
1442.align 4
1443.Lctr_enc_loop_done:
1444 add r2, r2, #8
1445 vld1.8 {q8}, [r0]! @ load input
1446 veor q0, q8
1447 vst1.8 {q0}, [r1]! @ write output
1448 cmp r2, #2
1449 blo .Lctr_enc_done
1450 vld1.8 {q9}, [r0]!
1451 veor q1, q9
1452 vst1.8 {q1}, [r1]!
1453 beq .Lctr_enc_done
1454 vld1.8 {q10}, [r0]!
1455 veor q4, q10
1456 vst1.8 {q4}, [r1]!
1457 cmp r2, #4
1458 blo .Lctr_enc_done
1459 vld1.8 {q11}, [r0]!
1460 veor q6, q11
1461 vst1.8 {q6}, [r1]!
1462 beq .Lctr_enc_done
1463 vld1.8 {q12}, [r0]!
1464 veor q3, q12
1465 vst1.8 {q3}, [r1]!
1466 cmp r2, #6
1467 blo .Lctr_enc_done
1468 vld1.8 {q13}, [r0]!
1469 veor q7, q13
1470 vst1.8 {q7}, [r1]!
1471 beq .Lctr_enc_done
1472 vld1.8 {q14}, [r0]
1473 veor q2, q14
1474 vst1.8 {q2}, [r1]!
1475
1476.Lctr_enc_done:
1477 vmov.i32 q0, #0
1478 vmov.i32 q1, #0
1479#ifndef BSAES_ASM_EXTENDED_KEY
1480.Lctr_enc_bzero: @ wipe key schedule [if any]
1481 vstmia sp!, {q0-q1}
1482 cmp sp, r9
1483 bne .Lctr_enc_bzero
1484#else
1485 vstmia sp, {q0-q1}
1486#endif
1487
1488 mov sp, r9
1489 add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
1490 VFP_ABI_POP
1491 ldmia sp!, {r4-r10, pc} @ return
1492
1493.align 4
1494.Lctr_enc_short:
1495 ldr ip, [sp] @ ctr pointer is passed on stack
1496 stmdb sp!, {r4-r8, lr}
1497
1498 mov r4, r0 @ copy arguments
1499 mov r5, r1
1500 mov r6, r2
1501 mov r7, r3
1502 ldr r8, [ip, #12] @ load counter LSW
1503 vld1.8 {q1}, [ip] @ load whole counter value
1504#ifdef __ARMEL__
1505 rev r8, r8
1506#endif
1507 sub sp, sp, #0x10
1508 vst1.8 {q1}, [sp,:64] @ copy counter value
1509 sub sp, sp, #0x10
1510
1511.Lctr_enc_short_loop:
1512 add r0, sp, #0x10 @ input counter value
1513 mov r1, sp @ output on the stack
1514 mov r2, r7 @ key
1515
1516 bl AES_encrypt
1517
1518 vld1.8 {q0}, [r4]! @ load input
1519 vld1.8 {q1}, [sp,:64] @ load encrypted counter
1520 add r8, r8, #1
1521#ifdef __ARMEL__
1522 rev r0, r8
1523 str r0, [sp, #0x1c] @ next counter value
1524#else
1525 str r8, [sp, #0x1c] @ next counter value
1526#endif
1527 veor q0,q0,q1
1528 vst1.8 {q0}, [r5]! @ store output
1529 subs r6, r6, #1
1530 bne .Lctr_enc_short_loop
1531
1532 vmov.i32 q0, #0
1533 vmov.i32 q1, #0
1534 vstmia sp!, {q0-q1}
1535
1536 ldmia sp!, {r4-r8, pc}
1537.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1538.globl bsaes_xts_encrypt
1539.type bsaes_xts_encrypt,%function
1540.align 4
1541bsaes_xts_encrypt:
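@ Arguments: r0 = input, r1 = output, r2 = length in bytes, r3 = data key
@ schedule (key1). With XTS_CHAIN_TWEAK (the kernel build) the first
@ stack argument points to the current tweak; otherwise the stack holds
@ key2 and the IV, and the initial tweak is generated below by encrypting
@ the IV with key2.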
1542 mov ip, sp
1543 stmdb sp!, {r4-r10, lr} @ 0x20
1544 VFP_ABI_PUSH
1545 mov r6, sp @ future r3
1546
1547 mov r7, r0
1548 mov r8, r1
1549 mov r9, r2
1550 mov r10, r3
1551
1552 sub r0, sp, #0x10 @ 0x10
1553 bic r0, #0xf @ align at 16 bytes
1554 mov sp, r0
1555
1556#ifdef XTS_CHAIN_TWEAK
1557 ldr r0, [ip] @ pointer to input tweak
1558#else
1559 @ generate initial tweak
1560 ldr r0, [ip, #4] @ iv[]
1561 mov r1, sp
1562 ldr r2, [ip, #0] @ key2
1563 bl AES_encrypt
1564 mov r0,sp @ pointer to initial tweak
1565#endif
1566
1567 ldr r1, [r10, #240] @ get # of rounds
1568 mov r3, r6
1569#ifndef BSAES_ASM_EXTENDED_KEY
1570 @ allocate the key schedule on the stack
1571 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
1572 @ add r12, #96 @ size of bit-sliced key schedule
1573 sub r12, #48 @ place for tweak[9]
1574
1575 @ populate the key schedule
1576 mov r4, r10 @ pass key
1577 mov r5, r1 @ pass # of rounds
1578 mov sp, r12
1579 add r12, #0x90 @ pass key schedule
1580 bl _bsaes_key_convert
1581 veor q7, q7, q15 @ fix up last round key
1582 vstmia r12, {q7} @ save last round key
1583#else
1584 ldr r12, [r10, #244]
1585 eors r12, #1
1586 beq 0f
1587
1588 str r12, [r10, #244]
1589 mov r4, r10 @ pass key
1590 mov r5, r1 @ pass # of rounds
1591 add r12, r10, #248 @ pass key schedule
1592 bl _bsaes_key_convert
1593 veor q7, q7, q15 @ fix up last round key
1594 vstmia r12, {q7}
1595
1596.align 2
15970: sub sp, #0x90 @ place for tweak[9]
1598#endif
1599
1600 vld1.8 {q8}, [r0] @ initial tweak
1601 adr r2, .Lxts_magic
1602
1603 subs r9, #0x80
1604 blo .Lxts_enc_short
1605 b .Lxts_enc_loop
1606
1607.align 4
1608.Lxts_enc_loop:
1609 vldmia r2, {q5} @ load XTS magic
1610 vshr.s64 q6, q8, #63
1611 mov r0, sp
1612 vand q6, q6, q5
1613 vadd.u64 q9, q8, q8
1614 vst1.64 {q8}, [r0,:128]!
1615 vswp d13,d12
1616 vshr.s64 q7, q9, #63
1617 veor q9, q9, q6
1618 vand q7, q7, q5
1619 vadd.u64 q10, q9, q9
1620 vst1.64 {q9}, [r0,:128]!
1621 vswp d15,d14
1622 vshr.s64 q6, q10, #63
1623 veor q10, q10, q7
1624 vand q6, q6, q5
1625 vld1.8 {q0}, [r7]!
1626 vadd.u64 q11, q10, q10
1627 vst1.64 {q10}, [r0,:128]!
1628 vswp d13,d12
1629 vshr.s64 q7, q11, #63
1630 veor q11, q11, q6
1631 vand q7, q7, q5
1632 vld1.8 {q1}, [r7]!
1633 veor q0, q0, q8
1634 vadd.u64 q12, q11, q11
1635 vst1.64 {q11}, [r0,:128]!
1636 vswp d15,d14
1637 vshr.s64 q6, q12, #63
1638 veor q12, q12, q7
1639 vand q6, q6, q5
1640 vld1.8 {q2}, [r7]!
1641 veor q1, q1, q9
1642 vadd.u64 q13, q12, q12
1643 vst1.64 {q12}, [r0,:128]!
1644 vswp d13,d12
1645 vshr.s64 q7, q13, #63
1646 veor q13, q13, q6
1647 vand q7, q7, q5
1648 vld1.8 {q3}, [r7]!
1649 veor q2, q2, q10
1650 vadd.u64 q14, q13, q13
1651 vst1.64 {q13}, [r0,:128]!
1652 vswp d15,d14
1653 vshr.s64 q6, q14, #63
1654 veor q14, q14, q7
1655 vand q6, q6, q5
1656 vld1.8 {q4}, [r7]!
1657 veor q3, q3, q11
1658 vadd.u64 q15, q14, q14
1659 vst1.64 {q14}, [r0,:128]!
1660 vswp d13,d12
1661 vshr.s64 q7, q15, #63
1662 veor q15, q15, q6
1663 vand q7, q7, q5
1664 vld1.8 {q5}, [r7]!
1665 veor q4, q4, q12
1666 vadd.u64 q8, q15, q15
1667 vst1.64 {q15}, [r0,:128]!
1668 vswp d15,d14
1669 veor q8, q8, q7
1670 vst1.64 {q8}, [r0,:128] @ next round tweak
1671
1672 vld1.8 {q6-q7}, [r7]!
1673 veor q5, q5, q13
1674#ifndef BSAES_ASM_EXTENDED_KEY
1675 add r4, sp, #0x90 @ pass key schedule
1676#else
1677 add r4, r10, #248 @ pass key schedule
1678#endif
1679 veor q6, q6, q14
1680 mov r5, r1 @ pass rounds
1681 veor q7, q7, q15
1682 mov r0, sp
1683
1684 bl _bsaes_encrypt8
1685
1686 vld1.64 {q8-q9}, [r0,:128]!
1687 vld1.64 {q10-q11}, [r0,:128]!
1688 veor q0, q0, q8
1689 vld1.64 {q12-q13}, [r0,:128]!
1690 veor q1, q1, q9
1691 veor q8, q4, q10
1692 vst1.8 {q0-q1}, [r8]!
1693 veor q9, q6, q11
1694 vld1.64 {q14-q15}, [r0,:128]!
1695 veor q10, q3, q12
1696 vst1.8 {q8-q9}, [r8]!
1697 veor q11, q7, q13
1698 veor q12, q2, q14
1699 vst1.8 {q10-q11}, [r8]!
1700 veor q13, q5, q15
1701 vst1.8 {q12-q13}, [r8]!
1702
1703 vld1.64 {q8}, [r0,:128] @ next round tweak
1704
1705 subs r9, #0x80
1706 bpl .Lxts_enc_loop
1707
1708.Lxts_enc_short:
1709 adds r9, #0x70
1710 bmi .Lxts_enc_done
1711
1712 vldmia r2, {q5} @ load XTS magic
1713 vshr.s64 q7, q8, #63
1714 mov r0, sp
1715 vand q7, q7, q5
1716 vadd.u64 q9, q8, q8
1717 vst1.64 {q8}, [r0,:128]!
1718 vswp d15,d14
1719 vshr.s64 q6, q9, #63
1720 veor q9, q9, q7
1721 vand q6, q6, q5
1722 vadd.u64 q10, q9, q9
1723 vst1.64 {q9}, [r0,:128]!
1724 vswp d13,d12
1725 vshr.s64 q7, q10, #63
1726 veor q10, q10, q6
1727 vand q7, q7, q5
1728 vld1.8 {q0}, [r7]!
1729 subs r9, #0x10
1730 bmi .Lxts_enc_1
1731 vadd.u64 q11, q10, q10
1732 vst1.64 {q10}, [r0,:128]!
1733 vswp d15,d14
1734 vshr.s64 q6, q11, #63
1735 veor q11, q11, q7
1736 vand q6, q6, q5
1737 vld1.8 {q1}, [r7]!
1738 subs r9, #0x10
1739 bmi .Lxts_enc_2
1740 veor q0, q0, q8
1741 vadd.u64 q12, q11, q11
1742 vst1.64 {q11}, [r0,:128]!
1743 vswp d13,d12
1744 vshr.s64 q7, q12, #63
1745 veor q12, q12, q6
1746 vand q7, q7, q5
1747 vld1.8 {q2}, [r7]!
1748 subs r9, #0x10
1749 bmi .Lxts_enc_3
1750 veor q1, q1, q9
1751 vadd.u64 q13, q12, q12
1752 vst1.64 {q12}, [r0,:128]!
1753 vswp d15,d14
1754 vshr.s64 q6, q13, #63
1755 veor q13, q13, q7
1756 vand q6, q6, q5
1757 vld1.8 {q3}, [r7]!
1758 subs r9, #0x10
1759 bmi .Lxts_enc_4
1760 veor q2, q2, q10
1761 vadd.u64 q14, q13, q13
1762 vst1.64 {q13}, [r0,:128]!
1763 vswp d13,d12
1764 vshr.s64 q7, q14, #63
1765 veor q14, q14, q6
1766 vand q7, q7, q5
1767 vld1.8 {q4}, [r7]!
1768 subs r9, #0x10
1769 bmi .Lxts_enc_5
1770 veor q3, q3, q11
1771 vadd.u64 q15, q14, q14
1772 vst1.64 {q14}, [r0,:128]!
1773 vswp d15,d14
1774 vshr.s64 q6, q15, #63
1775 veor q15, q15, q7
1776 vand q6, q6, q5
1777 vld1.8 {q5}, [r7]!
1778 subs r9, #0x10
1779 bmi .Lxts_enc_6
1780 veor q4, q4, q12
1781 sub r9, #0x10
1782 vst1.64 {q15}, [r0,:128] @ next round tweak
1783
1784 vld1.8 {q6}, [r7]!
1785 veor q5, q5, q13
1786#ifndef BSAES_ASM_EXTENDED_KEY
1787 add r4, sp, #0x90 @ pass key schedule
1788#else
1789 add r4, r10, #248 @ pass key schedule
1790#endif
1791 veor q6, q6, q14
1792 mov r5, r1 @ pass rounds
1793 mov r0, sp
1794
1795 bl _bsaes_encrypt8
1796
1797 vld1.64 {q8-q9}, [r0,:128]!
1798 vld1.64 {q10-q11}, [r0,:128]!
1799 veor q0, q0, q8
1800 vld1.64 {q12-q13}, [r0,:128]!
1801 veor q1, q1, q9
1802 veor q8, q4, q10
1803 vst1.8 {q0-q1}, [r8]!
1804 veor q9, q6, q11
1805 vld1.64 {q14}, [r0,:128]!
1806 veor q10, q3, q12
1807 vst1.8 {q8-q9}, [r8]!
1808 veor q11, q7, q13
1809 veor q12, q2, q14
1810 vst1.8 {q10-q11}, [r8]!
1811 vst1.8 {q12}, [r8]!
1812
1813 vld1.64 {q8}, [r0,:128] @ next round tweak
1814 b .Lxts_enc_done
1815.align 4
1816.Lxts_enc_6:
1817 vst1.64 {q14}, [r0,:128] @ next round tweak
1818
1819 veor q4, q4, q12
1820#ifndef BSAES_ASM_EXTENDED_KEY
1821 add r4, sp, #0x90 @ pass key schedule
1822#else
1823 add r4, r10, #248 @ pass key schedule
1824#endif
1825 veor q5, q5, q13
1826 mov r5, r1 @ pass rounds
1827 mov r0, sp
1828
1829 bl _bsaes_encrypt8
1830
1831 vld1.64 {q8-q9}, [r0,:128]!
1832 vld1.64 {q10-q11}, [r0,:128]!
1833 veor q0, q0, q8
1834 vld1.64 {q12-q13}, [r0,:128]!
1835 veor q1, q1, q9
1836 veor q8, q4, q10
1837 vst1.8 {q0-q1}, [r8]!
1838 veor q9, q6, q11
1839 veor q10, q3, q12
1840 vst1.8 {q8-q9}, [r8]!
1841 veor q11, q7, q13
1842 vst1.8 {q10-q11}, [r8]!
1843
1844 vld1.64 {q8}, [r0,:128] @ next round tweak
1845 b .Lxts_enc_done
1846
1847@ put this in range for both ARM and Thumb mode adr instructions
1848.align 5
1849.Lxts_magic:
1850 .quad 1, 0x87
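@ Tweak doubling in GF(2^128): T' = (T << 1) ^ (0x87 if the top bit of T
@ was set). In the loops that load this constant, vadd.u64 shifts each
@ 64-bit half left by one, vshr.s64 #63 turns each half's old MSB into an
@ all-ones mask, vand with {1, 0x87} selects the carry/reduction values,
@ and vswp + veor carry the low half's MSB into the high half while
@ folding 0x87 into the low half.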
1851
1852.align 5
1853.Lxts_enc_5:
1854 vst1.64 {q13}, [r0,:128] @ next round tweak
1855
1856 veor q3, q3, q11
1857#ifndef BSAES_ASM_EXTENDED_KEY
1858 add r4, sp, #0x90 @ pass key schedule
1859#else
1860 add r4, r10, #248 @ pass key schedule
1861#endif
1862 veor q4, q4, q12
1863 mov r5, r1 @ pass rounds
1864 mov r0, sp
1865
1866 bl _bsaes_encrypt8
1867
1868 vld1.64 {q8-q9}, [r0,:128]!
1869 vld1.64 {q10-q11}, [r0,:128]!
1870 veor q0, q0, q8
1871 vld1.64 {q12}, [r0,:128]!
1872 veor q1, q1, q9
1873 veor q8, q4, q10
1874 vst1.8 {q0-q1}, [r8]!
1875 veor q9, q6, q11
1876 veor q10, q3, q12
1877 vst1.8 {q8-q9}, [r8]!
1878 vst1.8 {q10}, [r8]!
1879
1880 vld1.64 {q8}, [r0,:128] @ next round tweak
1881 b .Lxts_enc_done
1882.align 4
1883.Lxts_enc_4:
1884 vst1.64 {q12}, [r0,:128] @ next round tweak
1885
1886 veor q2, q2, q10
1887#ifndef BSAES_ASM_EXTENDED_KEY
1888 add r4, sp, #0x90 @ pass key schedule
1889#else
1890 add r4, r10, #248 @ pass key schedule
1891#endif
1892 veor q3, q3, q11
1893 mov r5, r1 @ pass rounds
1894 mov r0, sp
1895
1896 bl _bsaes_encrypt8
1897
1898 vld1.64 {q8-q9}, [r0,:128]!
1899 vld1.64 {q10-q11}, [r0,:128]!
1900 veor q0, q0, q8
1901 veor q1, q1, q9
1902 veor q8, q4, q10
1903 vst1.8 {q0-q1}, [r8]!
1904 veor q9, q6, q11
1905 vst1.8 {q8-q9}, [r8]!
1906
1907 vld1.64 {q8}, [r0,:128] @ next round tweak
1908 b .Lxts_enc_done
1909.align 4
1910.Lxts_enc_3:
1911 vst1.64 {q11}, [r0,:128] @ next round tweak
1912
1913 veor q1, q1, q9
1914#ifndef BSAES_ASM_EXTENDED_KEY
1915 add r4, sp, #0x90 @ pass key schedule
1916#else
1917 add r4, r10, #248 @ pass key schedule
1918#endif
1919 veor q2, q2, q10
1920 mov r5, r1 @ pass rounds
1921 mov r0, sp
1922
1923 bl _bsaes_encrypt8
1924
1925 vld1.64 {q8-q9}, [r0,:128]!
1926 vld1.64 {q10}, [r0,:128]!
1927 veor q0, q0, q8
1928 veor q1, q1, q9
1929 veor q8, q4, q10
1930 vst1.8 {q0-q1}, [r8]!
1931 vst1.8 {q8}, [r8]!
1932
1933 vld1.64 {q8}, [r0,:128] @ next round tweak
1934 b .Lxts_enc_done
1935.align 4
1936.Lxts_enc_2:
1937 vst1.64 {q10}, [r0,:128] @ next round tweak
1938
1939 veor q0, q0, q8
1940#ifndef BSAES_ASM_EXTENDED_KEY
1941 add r4, sp, #0x90 @ pass key schedule
1942#else
1943 add r4, r10, #248 @ pass key schedule
1944#endif
1945 veor q1, q1, q9
1946 mov r5, r1 @ pass rounds
1947 mov r0, sp
1948
1949 bl _bsaes_encrypt8
1950
1951 vld1.64 {q8-q9}, [r0,:128]!
1952 veor q0, q0, q8
1953 veor q1, q1, q9
1954 vst1.8 {q0-q1}, [r8]!
1955
1956 vld1.64 {q8}, [r0,:128] @ next round tweak
1957 b .Lxts_enc_done
1958.align 4
1959.Lxts_enc_1:
1960 mov r0, sp
1961 veor q0, q8
1962 mov r1, sp
1963 vst1.8 {q0}, [sp,:128]
1964 mov r2, r10
1965 mov r4, r3 @ preserve fp
1966
1967 bl AES_encrypt
1968
1969 vld1.8 {q0}, [sp,:128]
1970 veor q0, q0, q8
1971 vst1.8 {q0}, [r8]!
1972 mov r3, r4
1973
1974 vmov q8, q9 @ next round tweak
1975
1976.Lxts_enc_done:
1977#ifndef XTS_CHAIN_TWEAK
1978 adds r9, #0x10
1979 beq .Lxts_enc_ret
1980 sub r6, r8, #0x10
1981
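@ Ciphertext stealing for a trailing partial block: swap the remaining
@ input bytes with the leading bytes of the last full ciphertext block
@ (r6 points at it in the output buffer); the displaced ciphertext bytes
@ become the short final output block, then the rebuilt block at r6 is
@ encrypted with the final tweak below.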
1982.Lxts_enc_steal:
1983 ldrb r0, [r7], #1
1984 ldrb r1, [r8, #-0x10]
1985 strb r0, [r8, #-0x10]
1986 strb r1, [r8], #1
1987
1988 subs r9, #1
1989 bhi .Lxts_enc_steal
1990
1991 vld1.8 {q0}, [r6]
1992 mov r0, sp
1993 veor q0, q0, q8
1994 mov r1, sp
1995 vst1.8 {q0}, [sp,:128]
1996 mov r2, r10
1997 mov r4, r3 @ preserve fp
1998
1999 bl AES_encrypt
2000
2001 vld1.8 {q0}, [sp,:128]
2002 veor q0, q0, q8
2003 vst1.8 {q0}, [r6]
2004 mov r3, r4
2005#endif
2006
2007.Lxts_enc_ret:
2008 bic r0, r3, #0xf
2009 vmov.i32 q0, #0
2010 vmov.i32 q1, #0
2011#ifdef XTS_CHAIN_TWEAK
2012 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2013#endif
2014.Lxts_enc_bzero: @ wipe key schedule [if any]
2015 vstmia sp!, {q0-q1}
2016 cmp sp, r0
2017 bne .Lxts_enc_bzero
2018
2019 mov sp, r3
2020#ifdef XTS_CHAIN_TWEAK
2021 vst1.8 {q8}, [r1]
2022#endif
2023 VFP_ABI_POP
2024 ldmia sp!, {r4-r10, pc} @ return
2025
2026.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2027
2028.globl bsaes_xts_decrypt
2029.type bsaes_xts_decrypt,%function
2030.align 4
2031bsaes_xts_decrypt:
2032 mov ip, sp
2033 stmdb sp!, {r4-r10, lr} @ 0x20
2034 VFP_ABI_PUSH
2035 mov r6, sp @ future r3
2036
2037 mov r7, r0
2038 mov r8, r1
2039 mov r9, r2
2040 mov r10, r3
2041
2042 sub r0, sp, #0x10 @ 0x10
2043 bic r0, #0xf @ align at 16 bytes
2044 mov sp, r0
2045
2046#ifdef XTS_CHAIN_TWEAK
2047 ldr r0, [ip] @ pointer to input tweak
2048#else
2049 @ generate initial tweak
2050 ldr r0, [ip, #4] @ iv[]
2051 mov r1, sp
2052 ldr r2, [ip, #0] @ key2
2053 bl AES_encrypt
2054 mov r0, sp @ pointer to initial tweak
2055#endif
2056
2057 ldr r1, [r10, #240] @ get # of rounds
2058 mov r3, r6
2059#ifndef BSAES_ASM_EXTENDED_KEY
2060 @ allocate the key schedule on the stack
2061 sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
2062 @ add r12, #96 @ size of bit-sliced key schedule
2063 sub r12, #48 @ place for tweak[9]
2064
2065 @ populate the key schedule
2066 mov r4, r10 @ pass key
2067 mov r5, r1 @ pass # of rounds
2068 mov sp, r12
2069 add r12, #0x90 @ pass key schedule
2070 bl _bsaes_key_convert
2071 add r4, sp, #0x90
2072 vldmia r4, {q6}
2073 vstmia r12, {q15} @ save last round key
2074 veor q7, q7, q6 @ fix up round 0 key
2075 vstmia r4, {q7}
2076#else
2077 ldr r12, [r10, #244]
2078 eors r12, #1
2079 beq 0f
2080
2081 str r12, [r10, #244]
2082 mov r4, r10 @ pass key
2083 mov r5, r1 @ pass # of rounds
2084 add r12, r10, #248 @ pass key schedule
2085 bl _bsaes_key_convert
2086 add r4, r10, #248
2087 vldmia r4, {q6}
2088 vstmia r12, {q15} @ save last round key
2089 veor q7, q7, q6 @ fix up round 0 key
2090 vstmia r4, {q7}
2091
2092.align 2
20930: sub sp, #0x90 @ place for tweak[9]
2094#endif
2095 vld1.8 {q8}, [r0] @ initial tweak
2096 adr r2, .Lxts_magic
2097
2098 tst r9, #0xf @ if not multiple of 16
2099 it ne @ Thumb2 thing, sanity check in ARM
2100 subne r9, #0x10 @ subtract another 16 bytes
2101 subs r9, #0x80
2102
2103 blo .Lxts_dec_short
2104 b .Lxts_dec_loop
2105
2106.align 4
2107.Lxts_dec_loop:
2108 vldmia r2, {q5} @ load XTS magic
2109 vshr.s64 q6, q8, #63
2110 mov r0, sp
2111 vand q6, q6, q5
2112 vadd.u64 q9, q8, q8
2113 vst1.64 {q8}, [r0,:128]!
2114 vswp d13,d12
2115 vshr.s64 q7, q9, #63
2116 veor q9, q9, q6
2117 vand q7, q7, q5
2118 vadd.u64 q10, q9, q9
2119 vst1.64 {q9}, [r0,:128]!
2120 vswp d15,d14
2121 vshr.s64 q6, q10, #63
2122 veor q10, q10, q7
2123 vand q6, q6, q5
2124 vld1.8 {q0}, [r7]!
2125 vadd.u64 q11, q10, q10
2126 vst1.64 {q10}, [r0,:128]!
2127 vswp d13,d12
2128 vshr.s64 q7, q11, #63
2129 veor q11, q11, q6
2130 vand q7, q7, q5
2131 vld1.8 {q1}, [r7]!
2132 veor q0, q0, q8
2133 vadd.u64 q12, q11, q11
2134 vst1.64 {q11}, [r0,:128]!
2135 vswp d15,d14
2136 vshr.s64 q6, q12, #63
2137 veor q12, q12, q7
2138 vand q6, q6, q5
2139 vld1.8 {q2}, [r7]!
2140 veor q1, q1, q9
2141 vadd.u64 q13, q12, q12
2142 vst1.64 {q12}, [r0,:128]!
2143 vswp d13,d12
2144 vshr.s64 q7, q13, #63
2145 veor q13, q13, q6
2146 vand q7, q7, q5
2147 vld1.8 {q3}, [r7]!
2148 veor q2, q2, q10
2149 vadd.u64 q14, q13, q13
2150 vst1.64 {q13}, [r0,:128]!
2151 vswp d15,d14
2152 vshr.s64 q6, q14, #63
2153 veor q14, q14, q7
2154 vand q6, q6, q5
2155 vld1.8 {q4}, [r7]!
2156 veor q3, q3, q11
2157 vadd.u64 q15, q14, q14
2158 vst1.64 {q14}, [r0,:128]!
2159 vswp d13,d12
2160 vshr.s64 q7, q15, #63
2161 veor q15, q15, q6
2162 vand q7, q7, q5
2163 vld1.8 {q5}, [r7]!
2164 veor q4, q4, q12
2165 vadd.u64 q8, q15, q15
2166 vst1.64 {q15}, [r0,:128]!
2167 vswp d15,d14
2168 veor q8, q8, q7
2169 vst1.64 {q8}, [r0,:128] @ next round tweak
2170
2171 vld1.8 {q6-q7}, [r7]!
2172 veor q5, q5, q13
2173#ifndef BSAES_ASM_EXTENDED_KEY
2174 add r4, sp, #0x90 @ pass key schedule
2175#else
2176 add r4, r10, #248 @ pass key schedule
2177#endif
2178 veor q6, q6, q14
2179 mov r5, r1 @ pass rounds
2180 veor q7, q7, q15
2181 mov r0, sp
2182
2183 bl _bsaes_decrypt8
2184
2185 vld1.64 {q8-q9}, [r0,:128]!
2186 vld1.64 {q10-q11}, [r0,:128]!
2187 veor q0, q0, q8
2188 vld1.64 {q12-q13}, [r0,:128]!
2189 veor q1, q1, q9
2190 veor q8, q6, q10
2191 vst1.8 {q0-q1}, [r8]!
2192 veor q9, q4, q11
2193 vld1.64 {q14-q15}, [r0,:128]!
2194 veor q10, q2, q12
2195 vst1.8 {q8-q9}, [r8]!
2196 veor q11, q7, q13
2197 veor q12, q3, q14
2198 vst1.8 {q10-q11}, [r8]!
2199 veor q13, q5, q15
2200 vst1.8 {q12-q13}, [r8]!
2201
2202 vld1.64 {q8}, [r0,:128] @ next round tweak
2203
2204 subs r9, #0x80
2205 bpl .Lxts_dec_loop
2206
2207.Lxts_dec_short:
2208 adds r9, #0x70
2209 bmi .Lxts_dec_done
2210
2211 vldmia r2, {q5} @ load XTS magic
2212 vshr.s64 q7, q8, #63
2213 mov r0, sp
2214 vand q7, q7, q5
2215 vadd.u64 q9, q8, q8
2216 vst1.64 {q8}, [r0,:128]!
2217 vswp d15,d14
2218 vshr.s64 q6, q9, #63
2219 veor q9, q9, q7
2220 vand q6, q6, q5
2221 vadd.u64 q10, q9, q9
2222 vst1.64 {q9}, [r0,:128]!
2223 vswp d13,d12
2224 vshr.s64 q7, q10, #63
2225 veor q10, q10, q6
2226 vand q7, q7, q5
2227 vld1.8 {q0}, [r7]!
2228 subs r9, #0x10
2229 bmi .Lxts_dec_1
2230 vadd.u64 q11, q10, q10
2231 vst1.64 {q10}, [r0,:128]!
2232 vswp d15,d14
2233 vshr.s64 q6, q11, #63
2234 veor q11, q11, q7
2235 vand q6, q6, q5
2236 vld1.8 {q1}, [r7]!
2237 subs r9, #0x10
2238 bmi .Lxts_dec_2
2239 veor q0, q0, q8
2240 vadd.u64 q12, q11, q11
2241 vst1.64 {q11}, [r0,:128]!
2242 vswp d13,d12
2243 vshr.s64 q7, q12, #63
2244 veor q12, q12, q6
2245 vand q7, q7, q5
2246 vld1.8 {q2}, [r7]!
2247 subs r9, #0x10
2248 bmi .Lxts_dec_3
2249 veor q1, q1, q9
2250 vadd.u64 q13, q12, q12
2251 vst1.64 {q12}, [r0,:128]!
2252 vswp d15,d14
2253 vshr.s64 q6, q13, #63
2254 veor q13, q13, q7
2255 vand q6, q6, q5
2256 vld1.8 {q3}, [r7]!
2257 subs r9, #0x10
2258 bmi .Lxts_dec_4
2259 veor q2, q2, q10
2260 vadd.u64 q14, q13, q13
2261 vst1.64 {q13}, [r0,:128]!
2262 vswp d13,d12
2263 vshr.s64 q7, q14, #63
2264 veor q14, q14, q6
2265 vand q7, q7, q5
2266 vld1.8 {q4}, [r7]!
2267 subs r9, #0x10
2268 bmi .Lxts_dec_5
2269 veor q3, q3, q11
2270 vadd.u64 q15, q14, q14
2271 vst1.64 {q14}, [r0,:128]!
2272 vswp d15,d14
2273 vshr.s64 q6, q15, #63
2274 veor q15, q15, q7
2275 vand q6, q6, q5
2276 vld1.8 {q5}, [r7]!
2277 subs r9, #0x10
2278 bmi .Lxts_dec_6
2279 veor q4, q4, q12
2280 sub r9, #0x10
2281 vst1.64 {q15}, [r0,:128] @ next round tweak
2282
2283 vld1.8 {q6}, [r7]!
2284 veor q5, q5, q13
2285#ifndef BSAES_ASM_EXTENDED_KEY
2286 add r4, sp, #0x90 @ pass key schedule
2287#else
2288 add r4, r10, #248 @ pass key schedule
2289#endif
2290 veor q6, q6, q14
2291 mov r5, r1 @ pass rounds
2292 mov r0, sp
2293
2294 bl _bsaes_decrypt8
2295
2296 vld1.64 {q8-q9}, [r0,:128]!
2297 vld1.64 {q10-q11}, [r0,:128]!
2298 veor q0, q0, q8
2299 vld1.64 {q12-q13}, [r0,:128]!
2300 veor q1, q1, q9
2301 veor q8, q6, q10
2302 vst1.8 {q0-q1}, [r8]!
2303 veor q9, q4, q11
2304 vld1.64 {q14}, [r0,:128]!
2305 veor q10, q2, q12
2306 vst1.8 {q8-q9}, [r8]!
2307 veor q11, q7, q13
2308 veor q12, q3, q14
2309 vst1.8 {q10-q11}, [r8]!
2310 vst1.8 {q12}, [r8]!
2311
2312 vld1.64 {q8}, [r0,:128] @ next round tweak
2313 b .Lxts_dec_done
2314.align 4
2315.Lxts_dec_6:
2316 vst1.64 {q14}, [r0,:128] @ next round tweak
2317
2318 veor q4, q4, q12
2319#ifndef BSAES_ASM_EXTENDED_KEY
2320 add r4, sp, #0x90 @ pass key schedule
2321#else
2322 add r4, r10, #248 @ pass key schedule
2323#endif
2324 veor q5, q5, q13
2325 mov r5, r1 @ pass rounds
2326 mov r0, sp
2327
2328 bl _bsaes_decrypt8
2329
2330 vld1.64 {q8-q9}, [r0,:128]!
2331 vld1.64 {q10-q11}, [r0,:128]!
2332 veor q0, q0, q8
2333 vld1.64 {q12-q13}, [r0,:128]!
2334 veor q1, q1, q9
2335 veor q8, q6, q10
2336 vst1.8 {q0-q1}, [r8]!
2337 veor q9, q4, q11
2338 veor q10, q2, q12
2339 vst1.8 {q8-q9}, [r8]!
2340 veor q11, q7, q13
2341 vst1.8 {q10-q11}, [r8]!
2342
2343 vld1.64 {q8}, [r0,:128] @ next round tweak
2344 b .Lxts_dec_done
2345.align 4
2346.Lxts_dec_5:
2347 vst1.64 {q13}, [r0,:128] @ next round tweak
2348
2349 veor q3, q3, q11
2350#ifndef BSAES_ASM_EXTENDED_KEY
2351 add r4, sp, #0x90 @ pass key schedule
2352#else
2353 add r4, r10, #248 @ pass key schedule
2354#endif
2355 veor q4, q4, q12
2356 mov r5, r1 @ pass rounds
2357 mov r0, sp
2358
2359 bl _bsaes_decrypt8
2360
2361 vld1.64 {q8-q9}, [r0,:128]!
2362 vld1.64 {q10-q11}, [r0,:128]!
2363 veor q0, q0, q8
2364 vld1.64 {q12}, [r0,:128]!
2365 veor q1, q1, q9
2366 veor q8, q6, q10
2367 vst1.8 {q0-q1}, [r8]!
2368 veor q9, q4, q11
2369 veor q10, q2, q12
2370 vst1.8 {q8-q9}, [r8]!
2371 vst1.8 {q10}, [r8]!
2372
2373 vld1.64 {q8}, [r0,:128] @ next round tweak
2374 b .Lxts_dec_done
2375.align 4
2376.Lxts_dec_4:
2377 vst1.64 {q12}, [r0,:128] @ next round tweak
2378
2379 veor q2, q2, q10
2380#ifndef BSAES_ASM_EXTENDED_KEY
2381 add r4, sp, #0x90 @ pass key schedule
2382#else
2383 add r4, r10, #248 @ pass key schedule
2384#endif
2385 veor q3, q3, q11
2386 mov r5, r1 @ pass rounds
2387 mov r0, sp
2388
2389 bl _bsaes_decrypt8
2390
2391 vld1.64 {q8-q9}, [r0,:128]!
2392 vld1.64 {q10-q11}, [r0,:128]!
2393 veor q0, q0, q8
2394 veor q1, q1, q9
2395 veor q8, q6, q10
2396 vst1.8 {q0-q1}, [r8]!
2397 veor q9, q4, q11
2398 vst1.8 {q8-q9}, [r8]!
2399
2400 vld1.64 {q8}, [r0,:128] @ next round tweak
2401 b .Lxts_dec_done
2402.align 4
2403.Lxts_dec_3:
2404 vst1.64 {q11}, [r0,:128] @ next round tweak
2405
2406 veor q1, q1, q9
2407#ifndef BSAES_ASM_EXTENDED_KEY
2408 add r4, sp, #0x90 @ pass key schedule
2409#else
2410 add r4, r10, #248 @ pass key schedule
2411#endif
2412 veor q2, q2, q10
2413 mov r5, r1 @ pass rounds
2414 mov r0, sp
2415
2416 bl _bsaes_decrypt8
2417
2418 vld1.64 {q8-q9}, [r0,:128]!
2419 vld1.64 {q10}, [r0,:128]!
2420 veor q0, q0, q8
2421 veor q1, q1, q9
2422 veor q8, q6, q10
2423 vst1.8 {q0-q1}, [r8]!
2424 vst1.8 {q8}, [r8]!
2425
2426 vld1.64 {q8}, [r0,:128] @ next round tweak
2427 b .Lxts_dec_done
2428.align 4
2429.Lxts_dec_2:
2430 vst1.64 {q10}, [r0,:128] @ next round tweak
2431
2432 veor q0, q0, q8
2433#ifndef BSAES_ASM_EXTENDED_KEY
2434 add r4, sp, #0x90 @ pass key schedule
2435#else
2436 add r4, r10, #248 @ pass key schedule
2437#endif
2438 veor q1, q1, q9
2439 mov r5, r1 @ pass rounds
2440 mov r0, sp
2441
2442 bl _bsaes_decrypt8
2443
2444 vld1.64 {q8-q9}, [r0,:128]!
2445 veor q0, q0, q8
2446 veor q1, q1, q9
2447 vst1.8 {q0-q1}, [r8]!
2448
2449 vld1.64 {q8}, [r0,:128] @ next round tweak
2450 b .Lxts_dec_done
2451.align 4
2452.Lxts_dec_1:
2453 mov r0, sp
2454 veor q0, q8
2455 mov r1, sp
2456 vst1.8 {q0}, [sp,:128]
2457 mov r2, r10
2458 mov r4, r3 @ preserve fp
2459 mov r5, r2 @ preserve magic
2460
2461 bl AES_decrypt
2462
2463 vld1.8 {q0}, [sp,:128]
2464 veor q0, q0, q8
2465 vst1.8 {q0}, [r8]!
2466 mov r3, r4
2467 mov r2, r5
2468
2469 vmov q8, q9 @ next round tweak
2470
2471.Lxts_dec_done:
2472#ifndef XTS_CHAIN_TWEAK
2473 adds r9, #0x10
2474 beq .Lxts_dec_ret
2475
2476 @ calculate one round of extra tweak for the stolen ciphertext
2477 vldmia r2, {q5}
2478 vshr.s64 q6, q8, #63
2479 vand q6, q6, q5
2480 vadd.u64 q9, q8, q8
2481 vswp d13,d12
2482 veor q9, q9, q6
2483
2484 @ perform the final decryption with the last tweak value
2485 vld1.8 {q0}, [r7]!
2486 mov r0, sp
2487 veor q0, q0, q9
2488 mov r1, sp
2489 vst1.8 {q0}, [sp,:128]
2490 mov r2, r10
2491 mov r4, r3 @ preserve fp
2492
2493 bl AES_decrypt
2494
2495 vld1.8 {q0}, [sp,:128]
2496 veor q0, q0, q9
2497 vst1.8 {q0}, [r8]
2498
2499 mov r6, r8
2500.Lxts_dec_steal:
2501 ldrb r1, [r8]
2502 ldrb r0, [r7], #1
2503 strb r1, [r8, #0x10]
2504 strb r0, [r8], #1
2505
2506 subs r9, #1
2507 bhi .Lxts_dec_steal
2508
2509 vld1.8 {q0}, [r6]
2510 mov r0, sp
2511 veor q0, q8
2512 mov r1, sp
2513 vst1.8 {q0}, [sp,:128]
2514 mov r2, r10
2515
2516 bl AES_decrypt
2517
2518 vld1.8 {q0}, [sp,:128]
2519 veor q0, q0, q8
2520 vst1.8 {q0}, [r6]
2521 mov r3, r4
2522#endif
2523
2524.Lxts_dec_ret:
2525 bic r0, r3, #0xf
2526 vmov.i32 q0, #0
2527 vmov.i32 q1, #0
2528#ifdef XTS_CHAIN_TWEAK
2529 ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
2530#endif
2531.Lxts_dec_bzero: @ wipe key schedule [if any]
2532 vstmia sp!, {q0-q1}
2533 cmp sp, r0
2534 bne .Lxts_dec_bzero
2535
2536 mov sp, r3
2537#ifdef XTS_CHAIN_TWEAK
2538 vst1.8 {q8}, [r1]
2539#endif
2540 VFP_ABI_POP
2541 ldmia sp!, {r4-r10, pc} @ return
2542
2543.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2544#endif
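
For illustration only: the .Lxts_dec_steal loop above is the byte-swapping half of XTS ciphertext stealing. A stand-alone C rendering of that loop is sketched below; the names are invented here (out corresponds to r8/r6, in to r7, tail to r9). The reassembled block at the start of the output is afterwards decrypted once more with the remaining tweak, as the last AES_decrypt call above does.

    #include <stddef.h>
    #include <stdint.h>

    static void xts_steal_bytes(uint8_t *out, const uint8_t *in, size_t tail)
    {
        size_t i;

        for (i = 0; i < tail; i++) {
            out[i + 16] = out[i];   /* move already-decrypted byte into the final, short block */
            out[i] = in[i];         /* pull in the remaining ciphertext byte */
        }
        /* out[0..15] is then decrypted once more with the leftover tweak */
    }
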
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
new file mode 100644
index 000000000000..4522366da759
--- /dev/null
+++ b/arch/arm/crypto/aesbs-glue.c
@@ -0,0 +1,434 @@
1/*
2 * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <asm/neon.h>
12#include <crypto/aes.h>
13#include <crypto/ablk_helper.h>
14#include <crypto/algapi.h>
15#include <linux/module.h>
16
17#include "aes_glue.h"
18
19#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
20
21struct BS_KEY {
22 struct AES_KEY rk;
23 int converted;
24 u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
25} __aligned(8);
26
27asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
28asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
29
30asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
31 struct BS_KEY *key, u8 iv[]);
32
33asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
34 struct BS_KEY *key, u8 const iv[]);
35
36asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
37 struct BS_KEY *key, u8 tweak[]);
38
39asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
40 struct BS_KEY *key, u8 tweak[]);
41
42struct aesbs_cbc_ctx {
43 struct AES_KEY enc;
44 struct BS_KEY dec;
45};
46
47struct aesbs_ctr_ctx {
48 struct BS_KEY enc;
49};
50
51struct aesbs_xts_ctx {
52 struct BS_KEY enc;
53 struct BS_KEY dec;
54 struct AES_KEY twkey;
55};
56
57static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
58 unsigned int key_len)
59{
60 struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
61 int bits = key_len * 8;
62
63 if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
64 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
65 return -EINVAL;
66 }
67 ctx->dec.rk = ctx->enc;
68 private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
69 ctx->dec.converted = 0;
70 return 0;
71}
72
73static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
74 unsigned int key_len)
75{
76 struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
77 int bits = key_len * 8;
78
79 if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
80 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
81 return -EINVAL;
82 }
83 ctx->enc.converted = 0;
84 return 0;
85}
86
87static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
88 unsigned int key_len)
89{
90 struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
91 int bits = key_len * 4;
92
93 if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
94 tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
95 return -EINVAL;
96 }
97 ctx->dec.rk = ctx->enc.rk;
98 private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
99 private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
100 ctx->enc.converted = ctx->dec.converted = 0;
101 return 0;
102}
103
104static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
105 struct scatterlist *dst,
106 struct scatterlist *src, unsigned int nbytes)
107{
108 struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
109 struct blkcipher_walk walk;
110 int err;
111
112 blkcipher_walk_init(&walk, dst, src, nbytes);
113 err = blkcipher_walk_virt(desc, &walk);
114
115 while (walk.nbytes) {
116 u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
117 u8 *src = walk.src.virt.addr;
118
119 if (walk.dst.virt.addr == walk.src.virt.addr) {
120 u8 *iv = walk.iv;
121
122 do {
123 crypto_xor(src, iv, AES_BLOCK_SIZE);
124 AES_encrypt(src, src, &ctx->enc);
125 iv = src;
126 src += AES_BLOCK_SIZE;
127 } while (--blocks);
128 memcpy(walk.iv, iv, AES_BLOCK_SIZE);
129 } else {
130 u8 *dst = walk.dst.virt.addr;
131
132 do {
133 crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
134 AES_encrypt(walk.iv, dst, &ctx->enc);
135 memcpy(walk.iv, dst, AES_BLOCK_SIZE);
136 src += AES_BLOCK_SIZE;
137 dst += AES_BLOCK_SIZE;
138 } while (--blocks);
139 }
140 err = blkcipher_walk_done(desc, &walk, 0);
141 }
142 return err;
143}
144
145static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
146 struct scatterlist *dst,
147 struct scatterlist *src, unsigned int nbytes)
148{
149 struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
150 struct blkcipher_walk walk;
151 int err;
152
153 blkcipher_walk_init(&walk, dst, src, nbytes);
154 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
155
156 while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
157 kernel_neon_begin();
158 bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
159 walk.nbytes, &ctx->dec, walk.iv);
160 kernel_neon_end();
161 err = blkcipher_walk_done(desc, &walk, 0);
162 }
163 while (walk.nbytes) {
164 u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
165 u8 *dst = walk.dst.virt.addr;
166 u8 *src = walk.src.virt.addr;
167 u8 bk[2][AES_BLOCK_SIZE];
168 u8 *iv = walk.iv;
169
170 do {
171 if (walk.dst.virt.addr == walk.src.virt.addr)
172 memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
173
174 AES_decrypt(src, dst, &ctx->dec.rk);
175 crypto_xor(dst, iv, AES_BLOCK_SIZE);
176
177 if (walk.dst.virt.addr == walk.src.virt.addr)
178 iv = bk[blocks & 1];
179 else
180 iv = src;
181
182 dst += AES_BLOCK_SIZE;
183 src += AES_BLOCK_SIZE;
184 } while (--blocks);
185 err = blkcipher_walk_done(desc, &walk, 0);
186 }
187 return err;
188}
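
The bk[2] ping-pong buffer in the scalar fallback above exists because in-place CBC decryption overwrites the ciphertext block that is still needed as the chaining value for the next block. A minimal stand-alone sketch of that pattern, assuming a hypothetical single-block decryptor dec() in place of AES_decrypt():

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef void (*blk_dec_fn)(const uint8_t in[16], uint8_t out[16]);

    static void cbc_decrypt_inplace(uint8_t *buf, size_t blocks, uint8_t iv[16],
                                    blk_dec_fn dec)
    {
        uint8_t saved[16];
        size_t i, j;

        for (i = 0; i < blocks; i++) {
            uint8_t *b = buf + i * 16;

            memcpy(saved, b, 16);       /* keep ciphertext: it is block i+1's IV */
            dec(b, b);                  /* overwrite with the raw decryption */
            for (j = 0; j < 16; j++)    /* CBC chaining */
                b[j] ^= iv[j];
            memcpy(iv, saved, 16);
        }
    }
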
189
190static void inc_be128_ctr(__be32 ctr[], u32 addend)
191{
192 int i;
193
194 for (i = 3; i >= 0; i--, addend = 1) {
195 u32 n = be32_to_cpu(ctr[i]) + addend;
196
197 ctr[i] = cpu_to_be32(n);
198 if (n >= addend)
199 break;
200 }
201}
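
inc_be128_ctr() adds a 32-bit value to the IV interpreted as a 128-bit big-endian integer, starting at the least significant word; the n >= addend test detects that no carry was produced, so the walk can stop early. A stand-alone byte-wise model with a worked carry case (illustration only, plain C instead of __be32):

    #include <stdint.h>
    #include <stdio.h>

    static void inc_be128(uint8_t ctr[16], uint32_t addend)
    {
        int i;

        for (i = 15; i >= 0; i--) {
            uint32_t n = ctr[i] + (addend & 0xff);

            ctr[i] = (uint8_t)n;
            addend = (addend >> 8) + (n >> 8);  /* propagate the carry */
        }
    }

    int main(void)
    {
        uint8_t ctr[16] = { 0 };

        ctr[15] = 0xff;
        inc_be128(ctr, 2);              /* ...00ff + 2 carries into the next byte */
        printf("%02x%02x\n", ctr[14], ctr[15]); /* prints 0101 */
        return 0;
    }
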
202
203static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
204 struct scatterlist *dst, struct scatterlist *src,
205 unsigned int nbytes)
206{
207 struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
208 struct blkcipher_walk walk;
209 u32 blocks;
210 int err;
211
212 blkcipher_walk_init(&walk, dst, src, nbytes);
213 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
214
215 while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
216 u32 tail = walk.nbytes % AES_BLOCK_SIZE;
217 __be32 *ctr = (__be32 *)walk.iv;
218 u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
219
220 /* avoid 32 bit counter overflow in the NEON code */
221 if (unlikely(headroom < blocks)) {
222 blocks = headroom + 1;
223 tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
224 }
225 kernel_neon_begin();
226 bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
227 walk.dst.virt.addr, blocks,
228 &ctx->enc, walk.iv);
229 kernel_neon_end();
230 inc_be128_ctr(ctr, blocks);
231
232 nbytes -= blocks * AES_BLOCK_SIZE;
233 if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
234 break;
235
236 err = blkcipher_walk_done(desc, &walk, tail);
237 }
238 if (walk.nbytes) {
239 u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
240 u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
241 u8 ks[AES_BLOCK_SIZE];
242
243 AES_encrypt(walk.iv, ks, &ctx->enc.rk);
244 if (tdst != tsrc)
245 memcpy(tdst, tsrc, nbytes);
246 crypto_xor(tdst, ks, nbytes);
247 err = blkcipher_walk_done(desc, &walk, 0);
248 }
249 return err;
250}
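
The headroom clamp in aesbs_ctr_encrypt() keeps the NEON routine, which only increments the low 32-bit counter word, from wrapping inside a single call; the carry into the upper words is then applied in C by inc_be128_ctr(). A worked check of the clamp with made-up numbers:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t ctr3 = 0xfffffffeu;            /* low counter word, nearly full */
        uint32_t blocks = 8;                    /* blocks requested this pass */
        uint32_t headroom = UINT_MAX - ctr3;    /* == 1: one more increment fits */

        if (headroom < blocks)
            blocks = headroom + 1;              /* == 2: last block uses 0xffffffff */
        printf("blocks trimmed to %u\n", blocks);
        return 0;
    }
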
251
252static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
253 struct scatterlist *dst,
254 struct scatterlist *src, unsigned int nbytes)
255{
256 struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
257 struct blkcipher_walk walk;
258 int err;
259
260 blkcipher_walk_init(&walk, dst, src, nbytes);
261 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
262
263 /* generate the initial tweak */
264 AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
265
266 while (walk.nbytes) {
267 kernel_neon_begin();
268 bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
269 walk.nbytes, &ctx->enc, walk.iv);
270 kernel_neon_end();
271 err = blkcipher_walk_done(desc, &walk, 0);
272 }
273 return err;
274}
275
276static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
277 struct scatterlist *dst,
278 struct scatterlist *src, unsigned int nbytes)
279{
280 struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
281 struct blkcipher_walk walk;
282 int err;
283
284 blkcipher_walk_init(&walk, dst, src, nbytes);
285 err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
286
287 /* generate the initial tweak */
288 AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
289
290 while (walk.nbytes) {
291 kernel_neon_begin();
292 bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
293 walk.nbytes, &ctx->dec, walk.iv);
294 kernel_neon_end();
295 err = blkcipher_walk_done(desc, &walk, 0);
296 }
297 return err;
298}
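
Both XTS paths derive the initial tweak by encrypting the IV with the second half of the key (ctx->twkey); the assembly then advances the tweak per block by multiplying it by x in GF(2^128), which is what the vshr.s64 #63 / vand / vadd.u64 / vswp sequence around the "XTS magic" constant computes. A stand-alone little-endian model of one tweak update (illustration only; assumes the standard 0x87 reduction constant):

    #include <stdint.h>

    static void xts_next_tweak(uint8_t t[16])
    {
        uint8_t carry = 0;
        int i;

        for (i = 0; i < 16; i++) {      /* shift the 128-bit value left by one */
            uint8_t c = t[i] >> 7;

            t[i] = (uint8_t)((t[i] << 1) | carry);
            carry = c;
        }
        if (carry)
            t[0] ^= 0x87;               /* reduce modulo x^128 + x^7 + x^2 + x + 1 */
    }
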
299
300static struct crypto_alg aesbs_algs[] = { {
301 .cra_name = "__cbc-aes-neonbs",
302 .cra_driver_name = "__driver-cbc-aes-neonbs",
303 .cra_priority = 0,
304 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
305 .cra_blocksize = AES_BLOCK_SIZE,
306 .cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
307 .cra_alignmask = 7,
308 .cra_type = &crypto_blkcipher_type,
309 .cra_module = THIS_MODULE,
310 .cra_blkcipher = {
311 .min_keysize = AES_MIN_KEY_SIZE,
312 .max_keysize = AES_MAX_KEY_SIZE,
313 .ivsize = AES_BLOCK_SIZE,
314 .setkey = aesbs_cbc_set_key,
315 .encrypt = aesbs_cbc_encrypt,
316 .decrypt = aesbs_cbc_decrypt,
317 },
318}, {
319 .cra_name = "__ctr-aes-neonbs",
320 .cra_driver_name = "__driver-ctr-aes-neonbs",
321 .cra_priority = 0,
322 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
323 .cra_blocksize = 1,
324 .cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
325 .cra_alignmask = 7,
326 .cra_type = &crypto_blkcipher_type,
327 .cra_module = THIS_MODULE,
328 .cra_blkcipher = {
329 .min_keysize = AES_MIN_KEY_SIZE,
330 .max_keysize = AES_MAX_KEY_SIZE,
331 .ivsize = AES_BLOCK_SIZE,
332 .setkey = aesbs_ctr_set_key,
333 .encrypt = aesbs_ctr_encrypt,
334 .decrypt = aesbs_ctr_encrypt,
335 },
336}, {
337 .cra_name = "__xts-aes-neonbs",
338 .cra_driver_name = "__driver-xts-aes-neonbs",
339 .cra_priority = 0,
340 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
341 .cra_blocksize = AES_BLOCK_SIZE,
342 .cra_ctxsize = sizeof(struct aesbs_xts_ctx),
343 .cra_alignmask = 7,
344 .cra_type = &crypto_blkcipher_type,
345 .cra_module = THIS_MODULE,
346 .cra_blkcipher = {
347 .min_keysize = 2 * AES_MIN_KEY_SIZE,
348 .max_keysize = 2 * AES_MAX_KEY_SIZE,
349 .ivsize = AES_BLOCK_SIZE,
350 .setkey = aesbs_xts_set_key,
351 .encrypt = aesbs_xts_encrypt,
352 .decrypt = aesbs_xts_decrypt,
353 },
354}, {
355 .cra_name = "cbc(aes)",
356 .cra_driver_name = "cbc-aes-neonbs",
357 .cra_priority = 300,
358 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
359 .cra_blocksize = AES_BLOCK_SIZE,
360 .cra_ctxsize = sizeof(struct async_helper_ctx),
361 .cra_alignmask = 7,
362 .cra_type = &crypto_ablkcipher_type,
363 .cra_module = THIS_MODULE,
364 .cra_init = ablk_init,
365 .cra_exit = ablk_exit,
366 .cra_ablkcipher = {
367 .min_keysize = AES_MIN_KEY_SIZE,
368 .max_keysize = AES_MAX_KEY_SIZE,
369 .ivsize = AES_BLOCK_SIZE,
370 .setkey = ablk_set_key,
371 .encrypt = __ablk_encrypt,
372 .decrypt = ablk_decrypt,
373 }
374}, {
375 .cra_name = "ctr(aes)",
376 .cra_driver_name = "ctr-aes-neonbs",
377 .cra_priority = 300,
378 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
379 .cra_blocksize = 1,
380 .cra_ctxsize = sizeof(struct async_helper_ctx),
381 .cra_alignmask = 7,
382 .cra_type = &crypto_ablkcipher_type,
383 .cra_module = THIS_MODULE,
384 .cra_init = ablk_init,
385 .cra_exit = ablk_exit,
386 .cra_ablkcipher = {
387 .min_keysize = AES_MIN_KEY_SIZE,
388 .max_keysize = AES_MAX_KEY_SIZE,
389 .ivsize = AES_BLOCK_SIZE,
390 .setkey = ablk_set_key,
391 .encrypt = ablk_encrypt,
392 .decrypt = ablk_decrypt,
393 }
394}, {
395 .cra_name = "xts(aes)",
396 .cra_driver_name = "xts-aes-neonbs",
397 .cra_priority = 300,
398 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
399 .cra_blocksize = AES_BLOCK_SIZE,
400 .cra_ctxsize = sizeof(struct async_helper_ctx),
401 .cra_alignmask = 7,
402 .cra_type = &crypto_ablkcipher_type,
403 .cra_module = THIS_MODULE,
404 .cra_init = ablk_init,
405 .cra_exit = ablk_exit,
406 .cra_ablkcipher = {
407 .min_keysize = 2 * AES_MIN_KEY_SIZE,
408 .max_keysize = 2 * AES_MAX_KEY_SIZE,
409 .ivsize = AES_BLOCK_SIZE,
410 .setkey = ablk_set_key,
411 .encrypt = ablk_encrypt,
412 .decrypt = ablk_decrypt,
413 }
414} };
415
416static int __init aesbs_mod_init(void)
417{
418 if (!cpu_has_neon())
419 return -ENODEV;
420
421 return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
422}
423
424static void __exit aesbs_mod_exit(void)
425{
426 crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
427}
428
429module_init(aesbs_mod_init);
430module_exit(aesbs_mod_exit);
431
432MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
433MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
434MODULE_LICENSE("GPL");
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
new file mode 100644
index 000000000000..f3d96d932573
--- /dev/null
+++ b/arch/arm/crypto/bsaes-armv7.pl
@@ -0,0 +1,2467 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8#
9# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
10# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
11# granted.
12# ====================================================================
13
14# Bit-sliced AES for ARM NEON
15#
16# February 2012.
17#
18# This implementation is direct adaptation of bsaes-x86_64 module for
19# ARM NEON. Except that this module is endian-neutral [in sense that
20# it can be compiled for either endianness] by courtesy of vld1.8's
21# neutrality. Initial version doesn't implement interface to OpenSSL,
22# only low-level primitives and unsupported entry points, just enough
23# to collect performance results, which for Cortex-A8 core are:
24#
25# encrypt 19.5 cycles per byte processed with 128-bit key
26# decrypt 22.1 cycles per byte processed with 128-bit key
27# key conv. 440 cycles per 128-bit key/0.18 of 8x block
28#
29# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
30# which is [much] worse than anticipated (for further details see
31# http://www.openssl.org/~appro/Snapdragon-S4.html).
32#
33# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
34# manages in 20.0 cycles].
35#
36# When comparing to x86_64 results keep in mind that NEON unit is
37# [mostly] single-issue and thus can't [fully] benefit from
38# instruction-level parallelism. And when comparing to aes-armv4
39# results keep in mind key schedule conversion overhead (see
40# bsaes-x86_64.pl for further details)...
41#
42# <appro@openssl.org>
43
44# April-August 2013
45#
46# Add CBC, CTR and XTS subroutines, adapt for kernel use.
47#
48# <ard.biesheuvel@linaro.org>
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
54my @XMM=map("q$_",(0..15));
55
56{
57my ($key,$rounds,$const)=("r4","r5","r6");
58
59sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
60sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
61
62sub Sbox {
63# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
64# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
65my @b=@_[0..7];
66my @t=@_[8..11];
67my @s=@_[12..15];
68 &InBasisChange (@b);
69 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
70 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
71}
72
73sub InBasisChange {
74# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
75# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
76my @b=@_[0..7];
77$code.=<<___;
78 veor @b[2], @b[2], @b[1]
79 veor @b[5], @b[5], @b[6]
80 veor @b[3], @b[3], @b[0]
81 veor @b[6], @b[6], @b[2]
82 veor @b[5], @b[5], @b[0]
83
84 veor @b[6], @b[6], @b[3]
85 veor @b[3], @b[3], @b[7]
86 veor @b[7], @b[7], @b[5]
87 veor @b[3], @b[3], @b[4]
88 veor @b[4], @b[4], @b[5]
89
90 veor @b[2], @b[2], @b[7]
91 veor @b[3], @b[3], @b[1]
92 veor @b[1], @b[1], @b[5]
93___
94}
95
96sub OutBasisChange {
97# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
98# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
99my @b=@_[0..7];
100$code.=<<___;
101 veor @b[0], @b[0], @b[6]
102 veor @b[1], @b[1], @b[4]
103 veor @b[4], @b[4], @b[6]
104 veor @b[2], @b[2], @b[0]
105 veor @b[6], @b[6], @b[1]
106
107 veor @b[1], @b[1], @b[5]
108 veor @b[5], @b[5], @b[3]
109 veor @b[3], @b[3], @b[7]
110 veor @b[7], @b[7], @b[5]
111 veor @b[2], @b[2], @b[5]
112
113 veor @b[4], @b[4], @b[7]
114___
115}
116
117sub InvSbox {
118# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
120my @b=@_[0..7];
121my @t=@_[8..11];
122my @s=@_[12..15];
123 &InvInBasisChange (@b);
124 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
125 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
126}
127
128sub InvInBasisChange { # OutBasisChange in reverse (with twist)
129my @b=@_[5,1,2,6,3,7,0,4];
130$code.=<<___
131 veor @b[1], @b[1], @b[7]
132 veor @b[4], @b[4], @b[7]
133
134 veor @b[7], @b[7], @b[5]
135 veor @b[1], @b[1], @b[3]
136 veor @b[2], @b[2], @b[5]
137 veor @b[3], @b[3], @b[7]
138
139 veor @b[6], @b[6], @b[1]
140 veor @b[2], @b[2], @b[0]
141 veor @b[5], @b[5], @b[3]
142 veor @b[4], @b[4], @b[6]
143 veor @b[0], @b[0], @b[6]
144 veor @b[1], @b[1], @b[4]
145___
146}
147
148sub InvOutBasisChange { # InBasisChange in reverse
149my @b=@_[2,5,7,3,6,1,0,4];
150$code.=<<___;
151 veor @b[1], @b[1], @b[5]
152 veor @b[2], @b[2], @b[7]
153
154 veor @b[3], @b[3], @b[1]
155 veor @b[4], @b[4], @b[5]
156 veor @b[7], @b[7], @b[5]
157 veor @b[3], @b[3], @b[4]
158 veor @b[5], @b[5], @b[0]
159 veor @b[3], @b[3], @b[7]
160 veor @b[6], @b[6], @b[2]
161 veor @b[2], @b[2], @b[1]
162 veor @b[6], @b[6], @b[3]
163
164 veor @b[3], @b[3], @b[0]
165 veor @b[5], @b[5], @b[6]
166___
167}
168
169sub Mul_GF4 {
170#;*************************************************************
171#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
172#;*************************************************************
173my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
174$code.=<<___;
175 veor $t0, $y0, $y1
176 vand $t0, $t0, $x0
177 veor $x0, $x0, $x1
178 vand $t1, $x1, $y0
179 vand $x0, $x0, $y1
180 veor $x1, $t1, $t0
181 veor $x0, $x0, $t1
182___
183}
184
185sub Mul_GF4_N { # not used, see next subroutine
186# multiply and scale by N
187my ($x0,$x1,$y0,$y1,$t0)=@_;
188$code.=<<___;
189 veor $t0, $y0, $y1
190 vand $t0, $t0, $x0
191 veor $x0, $x0, $x1
192 vand $x1, $x1, $y0
193 vand $x0, $x0, $y1
194 veor $x1, $x1, $x0
195 veor $x0, $x0, $t0
196___
197}
198
199sub Mul_GF4_N_GF4 {
200# interleaved Mul_GF4_N and Mul_GF4
201my ($x0,$x1,$y0,$y1,$t0,
202 $x2,$x3,$y2,$y3,$t1)=@_;
203$code.=<<___;
204 veor $t0, $y0, $y1
205 veor $t1, $y2, $y3
206 vand $t0, $t0, $x0
207 vand $t1, $t1, $x2
208 veor $x0, $x0, $x1
209 veor $x2, $x2, $x3
210 vand $x1, $x1, $y0
211 vand $x3, $x3, $y2
212 vand $x0, $x0, $y1
213 vand $x2, $x2, $y3
214 veor $x1, $x1, $x0
215 veor $x2, $x2, $x3
216 veor $x0, $x0, $t0
217 veor $x3, $x3, $t1
218___
219}
220sub Mul_GF16_2 {
221my @x=@_[0..7];
222my @y=@_[8..11];
223my @t=@_[12..15];
224$code.=<<___;
225 veor @t[0], @x[0], @x[2]
226 veor @t[1], @x[1], @x[3]
227___
228 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
229$code.=<<___;
230 veor @y[0], @y[0], @y[2]
231 veor @y[1], @y[1], @y[3]
232___
233 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
234 @x[2], @x[3], @y[2], @y[3], @t[2]);
235$code.=<<___;
236 veor @x[0], @x[0], @t[0]
237 veor @x[2], @x[2], @t[0]
238 veor @x[1], @x[1], @t[1]
239 veor @x[3], @x[3], @t[1]
240
241 veor @t[0], @x[4], @x[6]
242 veor @t[1], @x[5], @x[7]
243___
244 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
245 @x[6], @x[7], @y[2], @y[3], @t[2]);
246$code.=<<___;
247 veor @y[0], @y[0], @y[2]
248 veor @y[1], @y[1], @y[3]
249___
250 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
251$code.=<<___;
252 veor @x[4], @x[4], @t[0]
253 veor @x[6], @x[6], @t[0]
254 veor @x[5], @x[5], @t[1]
255 veor @x[7], @x[7], @t[1]
256___
257}
258sub Inv_GF256 {
259#;********************************************************************
260#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
261#;********************************************************************
262my @x=@_[0..7];
263my @t=@_[8..11];
264my @s=@_[12..15];
265# direct optimizations from hardware
266$code.=<<___;
267 veor @t[3], @x[4], @x[6]
268 veor @t[2], @x[5], @x[7]
269 veor @t[1], @x[1], @x[3]
270 veor @s[1], @x[7], @x[6]
271 vmov @t[0], @t[2]
272 veor @s[0], @x[0], @x[2]
273
274 vorr @t[2], @t[2], @t[1]
275 veor @s[3], @t[3], @t[0]
276 vand @s[2], @t[3], @s[0]
277 vorr @t[3], @t[3], @s[0]
278 veor @s[0], @s[0], @t[1]
279 vand @t[0], @t[0], @t[1]
280 veor @t[1], @x[3], @x[2]
281 vand @s[3], @s[3], @s[0]
282 vand @s[1], @s[1], @t[1]
283 veor @t[1], @x[4], @x[5]
284 veor @s[0], @x[1], @x[0]
285 veor @t[3], @t[3], @s[1]
286 veor @t[2], @t[2], @s[1]
287 vand @s[1], @t[1], @s[0]
288 vorr @t[1], @t[1], @s[0]
289 veor @t[3], @t[3], @s[3]
290 veor @t[0], @t[0], @s[1]
291 veor @t[2], @t[2], @s[2]
292 veor @t[1], @t[1], @s[3]
293 veor @t[0], @t[0], @s[2]
294 vand @s[0], @x[7], @x[3]
295 veor @t[1], @t[1], @s[2]
296 vand @s[1], @x[6], @x[2]
297 vand @s[2], @x[5], @x[1]
298 vorr @s[3], @x[4], @x[0]
299 veor @t[3], @t[3], @s[0]
300 veor @t[1], @t[1], @s[2]
301 veor @t[0], @t[0], @s[3]
302 veor @t[2], @t[2], @s[1]
303
304 @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
305
306 @ new smaller inversion
307
308 vand @s[2], @t[3], @t[1]
309 vmov @s[0], @t[0]
310
311 veor @s[1], @t[2], @s[2]
312 veor @s[3], @t[0], @s[2]
313 veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
314
315 vbsl @s[1], @t[1], @t[0]
316 vbsl @s[3], @t[3], @t[2]
317 veor @t[3], @t[3], @t[2]
318
319 vbsl @s[0], @s[1], @s[2]
320 vbsl @t[0], @s[2], @s[1]
321
322 vand @s[2], @s[0], @s[3]
323 veor @t[1], @t[1], @t[0]
324
325 veor @s[2], @s[2], @t[3]
326___
327# output in s3, s2, s1, t1
328
329# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
330
331# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
332 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
333
334### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
335}
336
337# AES linear components
338
339sub ShiftRows {
340my @x=@_[0..7];
341my @t=@_[8..11];
342my $mask=pop;
343$code.=<<___;
344 vldmia $key!, {@t[0]-@t[3]}
345 veor @t[0], @t[0], @x[0]
346 veor @t[1], @t[1], @x[1]
347 vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
348 vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
349 vldmia $key!, {@t[0]}
350 veor @t[2], @t[2], @x[2]
351 vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
352 vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
353 vldmia $key!, {@t[1]}
354 veor @t[3], @t[3], @x[3]
355 vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
356 vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
357 vldmia $key!, {@t[2]}
358 vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
359 vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
360 vldmia $key!, {@t[3]}
361 veor @t[0], @t[0], @x[4]
362 veor @t[1], @t[1], @x[5]
363 vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
364 vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
365 veor @t[2], @t[2], @x[6]
366 vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
367 vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
368 veor @t[3], @t[3], @x[7]
369 vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
370 vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
371 vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
372 vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
373___
374}
375
376sub MixColumns {
377# modified to emit output in order suitable for feeding back to aesenc[last]
378my @x=@_[0..7];
379my @t=@_[8..15];
380my $inv=@_[16]; # optional
381$code.=<<___;
382 vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
383 vext.8 @t[1], @x[1], @x[1], #12
384 veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
385 vext.8 @t[2], @x[2], @x[2], #12
386 veor @x[1], @x[1], @t[1]
387 vext.8 @t[3], @x[3], @x[3], #12
388 veor @x[2], @x[2], @t[2]
389 vext.8 @t[4], @x[4], @x[4], #12
390 veor @x[3], @x[3], @t[3]
391 vext.8 @t[5], @x[5], @x[5], #12
392 veor @x[4], @x[4], @t[4]
393 vext.8 @t[6], @x[6], @x[6], #12
394 veor @x[5], @x[5], @t[5]
395 vext.8 @t[7], @x[7], @x[7], #12
396 veor @x[6], @x[6], @t[6]
397
398 veor @t[1], @t[1], @x[0]
399 veor @x[7], @x[7], @t[7]
400 vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
401 veor @t[2], @t[2], @x[1]
402 veor @t[0], @t[0], @x[7]
403 veor @t[1], @t[1], @x[7]
404 vext.8 @x[1], @x[1], @x[1], #8
405 veor @t[5], @t[5], @x[4]
406 veor @x[0], @x[0], @t[0]
407 veor @t[6], @t[6], @x[5]
408 veor @x[1], @x[1], @t[1]
409 vext.8 @t[0], @x[4], @x[4], #8
410 veor @t[4], @t[4], @x[3]
411 vext.8 @t[1], @x[5], @x[5], #8
412 veor @t[7], @t[7], @x[6]
413 vext.8 @x[4], @x[3], @x[3], #8
414 veor @t[3], @t[3], @x[2]
415 vext.8 @x[5], @x[7], @x[7], #8
416 veor @t[4], @t[4], @x[7]
417 vext.8 @x[3], @x[6], @x[6], #8
418 veor @t[3], @t[3], @x[7]
419 vext.8 @x[6], @x[2], @x[2], #8
420 veor @x[7], @t[1], @t[5]
421___
422$code.=<<___ if (!$inv);
423 veor @x[2], @t[0], @t[4]
424 veor @x[4], @x[4], @t[3]
425 veor @x[5], @x[5], @t[7]
426 veor @x[3], @x[3], @t[6]
427 @ vmov @x[2], @t[0]
428 veor @x[6], @x[6], @t[2]
429 @ vmov @x[7], @t[1]
430___
431$code.=<<___ if ($inv);
432 veor @t[3], @t[3], @x[4]
433 veor @x[5], @x[5], @t[7]
434 veor @x[2], @x[3], @t[6]
435 veor @x[3], @t[0], @t[4]
436 veor @x[4], @x[6], @t[2]
437 vmov @x[6], @t[3]
438 @ vmov @x[7], @t[1]
439___
440}
441
442sub InvMixColumns_orig {
443my @x=@_[0..7];
444my @t=@_[8..15];
445
446$code.=<<___;
447 @ multiplication by 0x0e
448 vext.8 @t[7], @x[7], @x[7], #12
449 vmov @t[2], @x[2]
450 veor @x[2], @x[2], @x[5] @ 2 5
451 veor @x[7], @x[7], @x[5] @ 7 5
452 vext.8 @t[0], @x[0], @x[0], #12
453 vmov @t[5], @x[5]
454 veor @x[5], @x[5], @x[0] @ 5 0 [1]
455 veor @x[0], @x[0], @x[1] @ 0 1
456 vext.8 @t[1], @x[1], @x[1], #12
457 veor @x[1], @x[1], @x[2] @ 1 25
458 veor @x[0], @x[0], @x[6] @ 01 6 [2]
459 vext.8 @t[3], @x[3], @x[3], #12
460 veor @x[1], @x[1], @x[3] @ 125 3 [4]
461 veor @x[2], @x[2], @x[0] @ 25 016 [3]
462 veor @x[3], @x[3], @x[7] @ 3 75
463 veor @x[7], @x[7], @x[6] @ 75 6 [0]
464 vext.8 @t[6], @x[6], @x[6], #12
465 vmov @t[4], @x[4]
466 veor @x[6], @x[6], @x[4] @ 6 4
467 veor @x[4], @x[4], @x[3] @ 4 375 [6]
468 veor @x[3], @x[3], @x[7] @ 375 756=36
469 veor @x[6], @x[6], @t[5] @ 64 5 [7]
470 veor @x[3], @x[3], @t[2] @ 36 2
471 vext.8 @t[5], @t[5], @t[5], #12
472 veor @x[3], @x[3], @t[4] @ 362 4 [5]
473___
474 my @y = @x[7,5,0,2,1,3,4,6];
475$code.=<<___;
476 @ multiplication by 0x0b
477 veor @y[1], @y[1], @y[0]
478 veor @y[0], @y[0], @t[0]
479 vext.8 @t[2], @t[2], @t[2], #12
480 veor @y[1], @y[1], @t[1]
481 veor @y[0], @y[0], @t[5]
482 vext.8 @t[4], @t[4], @t[4], #12
483 veor @y[1], @y[1], @t[6]
484 veor @y[0], @y[0], @t[7]
485 veor @t[7], @t[7], @t[6] @ clobber t[7]
486
487 veor @y[3], @y[3], @t[0]
488 veor @y[1], @y[1], @y[0]
489 vext.8 @t[0], @t[0], @t[0], #12
490 veor @y[2], @y[2], @t[1]
491 veor @y[4], @y[4], @t[1]
492 vext.8 @t[1], @t[1], @t[1], #12
493 veor @y[2], @y[2], @t[2]
494 veor @y[3], @y[3], @t[2]
495 veor @y[5], @y[5], @t[2]
496 veor @y[2], @y[2], @t[7]
497 vext.8 @t[2], @t[2], @t[2], #12
498 veor @y[3], @y[3], @t[3]
499 veor @y[6], @y[6], @t[3]
500 veor @y[4], @y[4], @t[3]
501 veor @y[7], @y[7], @t[4]
502 vext.8 @t[3], @t[3], @t[3], #12
503 veor @y[5], @y[5], @t[4]
504 veor @y[7], @y[7], @t[7]
505 veor @t[7], @t[7], @t[5] @ clobber t[7] even more
506 veor @y[3], @y[3], @t[5]
507 veor @y[4], @y[4], @t[4]
508
509 veor @y[5], @y[5], @t[7]
510 vext.8 @t[4], @t[4], @t[4], #12
511 veor @y[6], @y[6], @t[7]
512 veor @y[4], @y[4], @t[7]
513
514 veor @t[7], @t[7], @t[5]
515 vext.8 @t[5], @t[5], @t[5], #12
516
517 @ multiplication by 0x0d
518 veor @y[4], @y[4], @y[7]
519 veor @t[7], @t[7], @t[6] @ restore t[7]
520 veor @y[7], @y[7], @t[4]
521 vext.8 @t[6], @t[6], @t[6], #12
522 veor @y[2], @y[2], @t[0]
523 veor @y[7], @y[7], @t[5]
524 vext.8 @t[7], @t[7], @t[7], #12
525 veor @y[2], @y[2], @t[2]
526
527 veor @y[3], @y[3], @y[1]
528 veor @y[1], @y[1], @t[1]
529 veor @y[0], @y[0], @t[0]
530 veor @y[3], @y[3], @t[0]
531 veor @y[1], @y[1], @t[5]
532 veor @y[0], @y[0], @t[5]
533 vext.8 @t[0], @t[0], @t[0], #12
534 veor @y[1], @y[1], @t[7]
535 veor @y[0], @y[0], @t[6]
536 veor @y[3], @y[3], @y[1]
537 veor @y[4], @y[4], @t[1]
538 vext.8 @t[1], @t[1], @t[1], #12
539
540 veor @y[7], @y[7], @t[7]
541 veor @y[4], @y[4], @t[2]
542 veor @y[5], @y[5], @t[2]
543 veor @y[2], @y[2], @t[6]
544 veor @t[6], @t[6], @t[3] @ clobber t[6]
545 vext.8 @t[2], @t[2], @t[2], #12
546 veor @y[4], @y[4], @y[7]
547 veor @y[3], @y[3], @t[6]
548
549 veor @y[6], @y[6], @t[6]
550 veor @y[5], @y[5], @t[5]
551 vext.8 @t[5], @t[5], @t[5], #12
552 veor @y[6], @y[6], @t[4]
553 vext.8 @t[4], @t[4], @t[4], #12
554 veor @y[5], @y[5], @t[6]
555 veor @y[6], @y[6], @t[7]
556 vext.8 @t[7], @t[7], @t[7], #12
557 veor @t[6], @t[6], @t[3] @ restore t[6]
558 vext.8 @t[3], @t[3], @t[3], #12
559
560 @ multiplication by 0x09
561 veor @y[4], @y[4], @y[1]
562 veor @t[1], @t[1], @y[1] @ t[1]=y[1]
563 veor @t[0], @t[0], @t[5] @ clobber t[0]
564 vext.8 @t[6], @t[6], @t[6], #12
565 veor @t[1], @t[1], @t[5]
566 veor @y[3], @y[3], @t[0]
567 veor @t[0], @t[0], @y[0] @ t[0]=y[0]
568 veor @t[1], @t[1], @t[6]
569 veor @t[6], @t[6], @t[7] @ clobber t[6]
570 veor @y[4], @y[4], @t[1]
571 veor @y[7], @y[7], @t[4]
572 veor @y[6], @y[6], @t[3]
573 veor @y[5], @y[5], @t[2]
574 veor @t[4], @t[4], @y[4] @ t[4]=y[4]
575 veor @t[3], @t[3], @y[3] @ t[3]=y[3]
576 veor @t[5], @t[5], @y[5] @ t[5]=y[5]
577 veor @t[2], @t[2], @y[2] @ t[2]=y[2]
578 veor @t[3], @t[3], @t[7]
579 veor @XMM[5], @t[5], @t[6]
580 veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
581 veor @XMM[2], @t[2], @t[6]
582 veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
583
584 vmov @XMM[0], @t[0]
585 vmov @XMM[1], @t[1]
586 @ vmov @XMM[2], @t[2]
587 vmov @XMM[3], @t[3]
588 vmov @XMM[4], @t[4]
589 @ vmov @XMM[5], @t[5]
590 @ vmov @XMM[6], @t[6]
591 @ vmov @XMM[7], @t[7]
592___
593}
594
595sub InvMixColumns {
596my @x=@_[0..7];
597my @t=@_[8..15];
598
599# Thanks to Jussi Kivilinna for providing pointer to
600#
601# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
602# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
603# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
604# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
605
606$code.=<<___;
607 @ multiplication by 0x05-0x00-0x04-0x00
608 vext.8 @t[0], @x[0], @x[0], #8
609 vext.8 @t[6], @x[6], @x[6], #8
610 vext.8 @t[7], @x[7], @x[7], #8
611 veor @t[0], @t[0], @x[0]
612 vext.8 @t[1], @x[1], @x[1], #8
613 veor @t[6], @t[6], @x[6]
614 vext.8 @t[2], @x[2], @x[2], #8
615 veor @t[7], @t[7], @x[7]
616 vext.8 @t[3], @x[3], @x[3], #8
617 veor @t[1], @t[1], @x[1]
618 vext.8 @t[4], @x[4], @x[4], #8
619 veor @t[2], @t[2], @x[2]
620 vext.8 @t[5], @x[5], @x[5], #8
621 veor @t[3], @t[3], @x[3]
622 veor @t[4], @t[4], @x[4]
623 veor @t[5], @t[5], @x[5]
624
625 veor @x[0], @x[0], @t[6]
626 veor @x[1], @x[1], @t[6]
627 veor @x[2], @x[2], @t[0]
628 veor @x[4], @x[4], @t[2]
629 veor @x[3], @x[3], @t[1]
630 veor @x[1], @x[1], @t[7]
631 veor @x[2], @x[2], @t[7]
632 veor @x[4], @x[4], @t[6]
633 veor @x[5], @x[5], @t[3]
634 veor @x[3], @x[3], @t[6]
635 veor @x[6], @x[6], @t[4]
636 veor @x[4], @x[4], @t[7]
637 veor @x[5], @x[5], @t[7]
638 veor @x[7], @x[7], @t[5]
639___
640 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
641}
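
The matrix identity quoted above (credited to Jussi Kivilinna) is what lets InvMixColumns reuse MixColumns: the 0x0e/0x0b/0x0d/0x09 matrix factors into the forward 0x02/0x03/0x01 matrix times a sparse 0x05/0x04 matrix, so only the cheap pre-multiplication has to be added before calling the forward routine. A stand-alone GF(2^8) check of the first row of that product (illustration only):

    #include <assert.h>
    #include <stdint.h>

    static uint8_t xtime(uint8_t a)
    {
        return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0x00));
    }

    static uint8_t gf_mul(uint8_t a, uint8_t b)  /* multiply in GF(2^8), AES polynomial */
    {
        uint8_t r = 0;

        while (b) {
            if (b & 1)
                r ^= a;
            a = xtime(a);
            b >>= 1;
        }
        return r;
    }

    int main(void)
    {
        /* row [02 03 01 01] times each column of the 05/04 matrix */
        assert((gf_mul(0x02, 0x05) ^ gf_mul(0x01, 0x04)) == 0x0e);
        assert((gf_mul(0x03, 0x05) ^ gf_mul(0x01, 0x04)) == 0x0b);
        assert((gf_mul(0x02, 0x04) ^ gf_mul(0x01, 0x05)) == 0x0d);
        assert((gf_mul(0x03, 0x04) ^ gf_mul(0x01, 0x05)) == 0x09);
        return 0;
    }
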
642
643sub swapmove {
644my ($a,$b,$n,$mask,$t)=@_;
645$code.=<<___;
646 vshr.u64 $t, $b, #$n
647 veor $t, $t, $a
648 vand $t, $t, $mask
649 veor $a, $a, $t
650 vshl.u64 $t, $t, #$n
651 veor $b, $b, $t
652___
653}
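
swapmove is the classic delta-swap used throughout the bitslicing code: the bits of $a selected by $mask are exchanged with the bits $n positions higher in $b, using one temporary. A single-lane C model (names invented here; the generated NEON code applies the same six operations to both 64-bit lanes of a q register):

    #include <stdint.h>

    static void swapmove(uint64_t *a, uint64_t *b, unsigned n, uint64_t mask)
    {
        uint64_t t = ((*b >> n) ^ *a) & mask;   /* differing bits at the masked positions */

        *a ^= t;
        *b ^= t << n;
    }
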
654sub swapmove2x {
655my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
656$code.=<<___;
657 vshr.u64 $t0, $b0, #$n
658 vshr.u64 $t1, $b1, #$n
659 veor $t0, $t0, $a0
660 veor $t1, $t1, $a1
661 vand $t0, $t0, $mask
662 vand $t1, $t1, $mask
663 veor $a0, $a0, $t0
664 vshl.u64 $t0, $t0, #$n
665 veor $a1, $a1, $t1
666 vshl.u64 $t1, $t1, #$n
667 veor $b0, $b0, $t0
668 veor $b1, $b1, $t1
669___
670}
671
672sub bitslice {
673my @x=reverse(@_[0..7]);
674my ($t0,$t1,$t2,$t3)=@_[8..11];
675$code.=<<___;
676 vmov.i8 $t0,#0x55 @ compose .LBS0
677 vmov.i8 $t1,#0x33 @ compose .LBS1
678___
679 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
680 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
681$code.=<<___;
682 vmov.i8 $t0,#0x0f @ compose .LBS2
683___
684 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
685 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
686
687 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
688 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
689}
690
691$code.=<<___;
692#ifndef __KERNEL__
693# include "arm_arch.h"
694
695# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
696# define VFP_ABI_POP vldmia sp!,{d8-d15}
697# define VFP_ABI_FRAME 0x40
698#else
699# define VFP_ABI_PUSH
700# define VFP_ABI_POP
701# define VFP_ABI_FRAME 0
702# define BSAES_ASM_EXTENDED_KEY
703# define XTS_CHAIN_TWEAK
704# define __ARM_ARCH__ __LINUX_ARM_ARCH__
705#endif
706
707#ifdef __thumb__
708# define adrl adr
709#endif
710
711#if __ARM_ARCH__>=7
712.text
713.syntax unified @ ARMv7-capable assembler is expected to handle this
714#ifdef __thumb2__
715.thumb
716#else
717.code 32
718#endif
719
720.fpu neon
721
722.type _bsaes_decrypt8,%function
723.align 4
724_bsaes_decrypt8:
725 adr $const,_bsaes_decrypt8
726 vldmia $key!, {@XMM[9]} @ round 0 key
727 add $const,$const,#.LM0ISR-_bsaes_decrypt8
728
729 vldmia $const!, {@XMM[8]} @ .LM0ISR
730 veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
731 veor @XMM[11], @XMM[1], @XMM[9]
732 vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
733 vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
734 veor @XMM[12], @XMM[2], @XMM[9]
735 vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
736 vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
737 veor @XMM[13], @XMM[3], @XMM[9]
738 vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
739 vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
740 veor @XMM[14], @XMM[4], @XMM[9]
741 vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
742 vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
743 veor @XMM[15], @XMM[5], @XMM[9]
744 vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
745 vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
746 veor @XMM[10], @XMM[6], @XMM[9]
747 vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
748 vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
749 veor @XMM[11], @XMM[7], @XMM[9]
750 vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
751 vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
752 vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
753 vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
754___
755 &bitslice (@XMM[0..7, 8..11]);
756$code.=<<___;
757 sub $rounds,$rounds,#1
758 b .Ldec_sbox
759.align 4
760.Ldec_loop:
761___
762 &ShiftRows (@XMM[0..7, 8..12]);
763$code.=".Ldec_sbox:\n";
764 &InvSbox (@XMM[0..7, 8..15]);
765$code.=<<___;
766 subs $rounds,$rounds,#1
767 bcc .Ldec_done
768___
769 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
770$code.=<<___;
771 vldmia $const, {@XMM[12]} @ .LISR
772 ite eq @ Thumb2 thing, sanity check in ARM
773 addeq $const,$const,#0x10
774 bne .Ldec_loop
775 vldmia $const, {@XMM[12]} @ .LISRM0
776 b .Ldec_loop
777.align 4
778.Ldec_done:
779___
780 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
781$code.=<<___;
782 vldmia $key, {@XMM[8]} @ last round key
783 veor @XMM[6], @XMM[6], @XMM[8]
784 veor @XMM[4], @XMM[4], @XMM[8]
785 veor @XMM[2], @XMM[2], @XMM[8]
786 veor @XMM[7], @XMM[7], @XMM[8]
787 veor @XMM[3], @XMM[3], @XMM[8]
788 veor @XMM[5], @XMM[5], @XMM[8]
789 veor @XMM[0], @XMM[0], @XMM[8]
790 veor @XMM[1], @XMM[1], @XMM[8]
791 bx lr
792.size _bsaes_decrypt8,.-_bsaes_decrypt8
793
794.type _bsaes_const,%object
795.align 6
796_bsaes_const:
797.LM0ISR: @ InvShiftRows constants
798 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
799.LISR:
800 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
801.LISRM0:
802 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
803.LM0SR: @ ShiftRows constants
804 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
805.LSR:
806 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
807.LSRM0:
808 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
809.LM0:
810 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
811.LREVM0SR:
812 .quad 0x090d01050c000408, 0x03070b0f060a0e02
813.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
814.align 6
815.size _bsaes_const,.-_bsaes_const
816
817.type _bsaes_encrypt8,%function
818.align 4
819_bsaes_encrypt8:
820 adr $const,_bsaes_encrypt8
821 vldmia $key!, {@XMM[9]} @ round 0 key
822 sub $const,$const,#_bsaes_encrypt8-.LM0SR
823
824 vldmia $const!, {@XMM[8]} @ .LM0SR
825_bsaes_encrypt8_alt:
826 veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
827 veor @XMM[11], @XMM[1], @XMM[9]
828 vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
829 vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
830 veor @XMM[12], @XMM[2], @XMM[9]
831 vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
832 vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
833 veor @XMM[13], @XMM[3], @XMM[9]
834 vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
835 vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
836 veor @XMM[14], @XMM[4], @XMM[9]
837 vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
838 vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
839 veor @XMM[15], @XMM[5], @XMM[9]
840 vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
841 vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
842 veor @XMM[10], @XMM[6], @XMM[9]
843 vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
844 vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
845 veor @XMM[11], @XMM[7], @XMM[9]
846 vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
847 vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
848 vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
849 vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
850_bsaes_encrypt8_bitslice:
851___
852 &bitslice (@XMM[0..7, 8..11]);
853$code.=<<___;
854 sub $rounds,$rounds,#1
855 b .Lenc_sbox
856.align 4
857.Lenc_loop:
858___
859 &ShiftRows (@XMM[0..7, 8..12]);
860$code.=".Lenc_sbox:\n";
861 &Sbox (@XMM[0..7, 8..15]);
862$code.=<<___;
863 subs $rounds,$rounds,#1
864 bcc .Lenc_done
865___
866 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
867$code.=<<___;
868 vldmia $const, {@XMM[12]} @ .LSR
869	ite	eq			@ Thumb2 thing, sanity check in ARM
870 addeq $const,$const,#0x10
871 bne .Lenc_loop
872 vldmia $const, {@XMM[12]} @ .LSRM0
873 b .Lenc_loop
874.align 4
875.Lenc_done:
876___
877 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
878 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
879$code.=<<___;
880 vldmia $key, {@XMM[8]} @ last round key
881 veor @XMM[4], @XMM[4], @XMM[8]
882 veor @XMM[6], @XMM[6], @XMM[8]
883 veor @XMM[3], @XMM[3], @XMM[8]
884 veor @XMM[7], @XMM[7], @XMM[8]
885 veor @XMM[2], @XMM[2], @XMM[8]
886 veor @XMM[5], @XMM[5], @XMM[8]
887 veor @XMM[0], @XMM[0], @XMM[8]
888 veor @XMM[1], @XMM[1], @XMM[8]
889 bx lr
890.size _bsaes_encrypt8,.-_bsaes_encrypt8
891___
892}
893{
894my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
895
896sub bitslice_key {
897my @x=reverse(@_[0..7]);
898my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
899
900 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
901$code.=<<___;
902 @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
903 vmov @x[2], @x[0]
904 vmov @x[3], @x[1]
905___
906 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
907
908 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
909$code.=<<___;
910 @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
911 vmov @x[4], @x[0]
912 vmov @x[6], @x[2]
913 vmov @x[5], @x[1]
914 vmov @x[7], @x[3]
915___
916 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
917 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
918}
919
920$code.=<<___;
921.type _bsaes_key_convert,%function
922.align 4
923_bsaes_key_convert:
924 adr $const,_bsaes_key_convert
925 vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
926 sub $const,$const,#_bsaes_key_convert-.LM0
927 vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
928
929 vmov.i8 @XMM[8], #0x01 @ bit masks
930 vmov.i8 @XMM[9], #0x02
931 vmov.i8 @XMM[10], #0x04
932 vmov.i8 @XMM[11], #0x08
933 vmov.i8 @XMM[12], #0x10
934 vmov.i8 @XMM[13], #0x20
935 vldmia $const, {@XMM[14]} @ .LM0
936
937#ifdef __ARMEL__
938 vrev32.8 @XMM[7], @XMM[7]
939 vrev32.8 @XMM[15], @XMM[15]
940#endif
941 sub $rounds,$rounds,#1
942 vstmia $out!, {@XMM[7]} @ save round 0 key
943 b .Lkey_loop
944
945.align 4
946.Lkey_loop:
947 vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
948 vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
949 vmov.i8 @XMM[6], #0x40
950 vmov.i8 @XMM[15], #0x80
951
952 vtst.8 @XMM[0], @XMM[7], @XMM[8]
953 vtst.8 @XMM[1], @XMM[7], @XMM[9]
954 vtst.8 @XMM[2], @XMM[7], @XMM[10]
955 vtst.8 @XMM[3], @XMM[7], @XMM[11]
956 vtst.8 @XMM[4], @XMM[7], @XMM[12]
957 vtst.8 @XMM[5], @XMM[7], @XMM[13]
958 vtst.8 @XMM[6], @XMM[7], @XMM[6]
959 vtst.8 @XMM[7], @XMM[7], @XMM[15]
960 vld1.8 {@XMM[15]}, [$inp]! @ load next round key
961 vmvn @XMM[0], @XMM[0] @ "pnot"
962 vmvn @XMM[1], @XMM[1]
963 vmvn @XMM[5], @XMM[5]
964 vmvn @XMM[6], @XMM[6]
965#ifdef __ARMEL__
966 vrev32.8 @XMM[15], @XMM[15]
967#endif
968 subs $rounds,$rounds,#1
969 vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
970 bne .Lkey_loop
971
972 vmov.i8 @XMM[7],#0x63 @ compose .L63
973 @ don't save last round key
974 bx lr
975.size _bsaes_key_convert,.-_bsaes_key_convert
976___
977}
978
979if (0) { # following four functions are unsupported interface
980 # used for benchmarking...
981$code.=<<___;
982.globl bsaes_enc_key_convert
983.type bsaes_enc_key_convert,%function
984.align 4
985bsaes_enc_key_convert:
986 stmdb sp!,{r4-r6,lr}
987 vstmdb sp!,{d8-d15} @ ABI specification says so
988
989 ldr r5,[$inp,#240] @ pass rounds
990 mov r4,$inp @ pass key
991 mov r12,$out @ pass key schedule
992 bl _bsaes_key_convert
993 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
994 vstmia r12, {@XMM[7]} @ save last round key
995
996 vldmia sp!,{d8-d15}
997 ldmia sp!,{r4-r6,pc}
998.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
999
1000.globl bsaes_encrypt_128
1001.type bsaes_encrypt_128,%function
1002.align 4
1003bsaes_encrypt_128:
1004 stmdb sp!,{r4-r6,lr}
1005 vstmdb sp!,{d8-d15} @ ABI specification says so
1006.Lenc128_loop:
1007 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1008 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1009 mov r4,$key @ pass the key
1010 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1011 mov r5,#10 @ pass rounds
1012 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1013
1014 bl _bsaes_encrypt8
1015
1016 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1017 vst1.8 {@XMM[4]}, [$out]!
1018 vst1.8 {@XMM[6]}, [$out]!
1019 vst1.8 {@XMM[3]}, [$out]!
1020 vst1.8 {@XMM[7]}, [$out]!
1021 vst1.8 {@XMM[2]}, [$out]!
1022 subs $len,$len,#0x80
1023 vst1.8 {@XMM[5]}, [$out]!
1024 bhi .Lenc128_loop
1025
1026 vldmia sp!,{d8-d15}
1027 ldmia sp!,{r4-r6,pc}
1028.size bsaes_encrypt_128,.-bsaes_encrypt_128
1029
1030.globl bsaes_dec_key_convert
1031.type bsaes_dec_key_convert,%function
1032.align 4
1033bsaes_dec_key_convert:
1034 stmdb sp!,{r4-r6,lr}
1035 vstmdb sp!,{d8-d15} @ ABI specification says so
1036
1037 ldr r5,[$inp,#240] @ pass rounds
1038 mov r4,$inp @ pass key
1039 mov r12,$out @ pass key schedule
1040 bl _bsaes_key_convert
1041 vldmia $out, {@XMM[6]}
1042 vstmia r12, {@XMM[15]} @ save last round key
1043 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1044 vstmia $out, {@XMM[7]}
1045
1046 vldmia sp!,{d8-d15}
1047 ldmia sp!,{r4-r6,pc}
1048.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1049
1050.globl bsaes_decrypt_128
1051.type bsaes_decrypt_128,%function
1052.align 4
1053bsaes_decrypt_128:
1054 stmdb sp!,{r4-r6,lr}
1055 vstmdb sp!,{d8-d15} @ ABI specification says so
1056.Ldec128_loop:
1057 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1058 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1059 mov r4,$key @ pass the key
1060 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1061 mov r5,#10 @ pass rounds
1062 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1063
1064 bl _bsaes_decrypt8
1065
1066 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1067 vst1.8 {@XMM[6]}, [$out]!
1068 vst1.8 {@XMM[4]}, [$out]!
1069 vst1.8 {@XMM[2]}, [$out]!
1070 vst1.8 {@XMM[7]}, [$out]!
1071 vst1.8 {@XMM[3]}, [$out]!
1072 subs $len,$len,#0x80
1073 vst1.8 {@XMM[5]}, [$out]!
1074 bhi .Ldec128_loop
1075
1076 vldmia sp!,{d8-d15}
1077 ldmia sp!,{r4-r6,pc}
1078.size bsaes_decrypt_128,.-bsaes_decrypt_128
1079___
1080}
1081{
1082my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
1083my ($keysched)=("sp");
1084
1085$code.=<<___;
1086.extern AES_cbc_encrypt
1087.extern AES_decrypt
1088
1089.global bsaes_cbc_encrypt
1090.type bsaes_cbc_encrypt,%function
1091.align 5
1092bsaes_cbc_encrypt:
1093#ifndef __KERNEL__
1094 cmp $len, #128
1095#ifndef __thumb__
1096 blo AES_cbc_encrypt
1097#else
1098 bhs 1f
1099 b AES_cbc_encrypt
11001:
1101#endif
1102#endif
1103
1104 @ it is up to the caller to make sure we are called with enc == 0
1105
1106 mov ip, sp
1107 stmdb sp!, {r4-r10, lr}
1108 VFP_ABI_PUSH
1109 ldr $ivp, [ip] @ IV is 1st arg on the stack
1110 mov $len, $len, lsr#4 @ len in 16 byte blocks
1111 sub sp, #0x10 @ scratch space to carry over the IV
1112 mov $fp, sp @ save sp
1113
1114 ldr $rounds, [$key, #240] @ get # of rounds
1115#ifndef BSAES_ASM_EXTENDED_KEY
1116 @ allocate the key schedule on the stack
1117 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1118	add	r12, #`128-32`	@ size of bit-sliced key schedule
1119
1120 @ populate the key schedule
1121 mov r4, $key @ pass key
1122 mov r5, $rounds @ pass # of rounds
1123 mov sp, r12 @ sp is $keysched
1124 bl _bsaes_key_convert
1125 vldmia $keysched, {@XMM[6]}
1126 vstmia r12, {@XMM[15]} @ save last round key
1127 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1128 vstmia $keysched, {@XMM[7]}
1129#else
1130 ldr r12, [$key, #244]
1131 eors r12, #1
1132 beq 0f
1133
1134 @ populate the key schedule
1135 str r12, [$key, #244]
1136 mov r4, $key @ pass key
1137 mov r5, $rounds @ pass # of rounds
1138 add r12, $key, #248 @ pass key schedule
1139 bl _bsaes_key_convert
1140 add r4, $key, #248
1141 vldmia r4, {@XMM[6]}
1142 vstmia r12, {@XMM[15]} @ save last round key
1143 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1144 vstmia r4, {@XMM[7]}
1145
1146.align 2
11470:
1148#endif
1149
1150 vld1.8 {@XMM[15]}, [$ivp] @ load IV
1151 b .Lcbc_dec_loop
1152
1153.align 4
1154.Lcbc_dec_loop:
1155 subs $len, $len, #0x8
1156 bmi .Lcbc_dec_loop_finish
1157
1158 vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1159 vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1160#ifndef BSAES_ASM_EXTENDED_KEY
1161 mov r4, $keysched @ pass the key
1162#else
1163 add r4, $key, #248
1164#endif
1165 vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1166 mov r5, $rounds
1167 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
1168 sub $inp, $inp, #0x60
1169 vstmia $fp, {@XMM[15]} @ put aside IV
1170
1171 bl _bsaes_decrypt8
1172
1173 vldmia $fp, {@XMM[14]} @ reload IV
1174 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1175 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1176 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1177 veor @XMM[1], @XMM[1], @XMM[8]
1178 veor @XMM[6], @XMM[6], @XMM[9]
1179 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1180 veor @XMM[4], @XMM[4], @XMM[10]
1181 veor @XMM[2], @XMM[2], @XMM[11]
1182 vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1183 veor @XMM[7], @XMM[7], @XMM[12]
1184 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1185 veor @XMM[3], @XMM[3], @XMM[13]
1186 vst1.8 {@XMM[6]}, [$out]!
1187 veor @XMM[5], @XMM[5], @XMM[14]
1188 vst1.8 {@XMM[4]}, [$out]!
1189 vst1.8 {@XMM[2]}, [$out]!
1190 vst1.8 {@XMM[7]}, [$out]!
1191 vst1.8 {@XMM[3]}, [$out]!
1192 vst1.8 {@XMM[5]}, [$out]!
1193
1194 b .Lcbc_dec_loop
1195
1196.Lcbc_dec_loop_finish:
1197 adds $len, $len, #8
1198 beq .Lcbc_dec_done
1199
1200 vld1.8 {@XMM[0]}, [$inp]! @ load input
1201 cmp $len, #2
1202 blo .Lcbc_dec_one
1203 vld1.8 {@XMM[1]}, [$inp]!
1204#ifndef BSAES_ASM_EXTENDED_KEY
1205 mov r4, $keysched @ pass the key
1206#else
1207 add r4, $key, #248
1208#endif
1209 mov r5, $rounds
1210 vstmia $fp, {@XMM[15]} @ put aside IV
1211 beq .Lcbc_dec_two
1212 vld1.8 {@XMM[2]}, [$inp]!
1213 cmp $len, #4
1214 blo .Lcbc_dec_three
1215 vld1.8 {@XMM[3]}, [$inp]!
1216 beq .Lcbc_dec_four
1217 vld1.8 {@XMM[4]}, [$inp]!
1218 cmp $len, #6
1219 blo .Lcbc_dec_five
1220 vld1.8 {@XMM[5]}, [$inp]!
1221 beq .Lcbc_dec_six
1222 vld1.8 {@XMM[6]}, [$inp]!
1223 sub $inp, $inp, #0x70
1224
1225 bl _bsaes_decrypt8
1226
1227 vldmia $fp, {@XMM[14]} @ reload IV
1228 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1229 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1230 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1231 veor @XMM[1], @XMM[1], @XMM[8]
1232 veor @XMM[6], @XMM[6], @XMM[9]
1233 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1234 veor @XMM[4], @XMM[4], @XMM[10]
1235 veor @XMM[2], @XMM[2], @XMM[11]
1236 vld1.8 {@XMM[15]}, [$inp]!
1237 veor @XMM[7], @XMM[7], @XMM[12]
1238 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1239 veor @XMM[3], @XMM[3], @XMM[13]
1240 vst1.8 {@XMM[6]}, [$out]!
1241 vst1.8 {@XMM[4]}, [$out]!
1242 vst1.8 {@XMM[2]}, [$out]!
1243 vst1.8 {@XMM[7]}, [$out]!
1244 vst1.8 {@XMM[3]}, [$out]!
1245 b .Lcbc_dec_done
1246.align 4
1247.Lcbc_dec_six:
1248 sub $inp, $inp, #0x60
1249 bl _bsaes_decrypt8
1250 vldmia $fp,{@XMM[14]} @ reload IV
1251 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1252 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1253 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1254 veor @XMM[1], @XMM[1], @XMM[8]
1255 veor @XMM[6], @XMM[6], @XMM[9]
1256 vld1.8 {@XMM[12]}, [$inp]!
1257 veor @XMM[4], @XMM[4], @XMM[10]
1258 veor @XMM[2], @XMM[2], @XMM[11]
1259 vld1.8 {@XMM[15]}, [$inp]!
1260 veor @XMM[7], @XMM[7], @XMM[12]
1261 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1262 vst1.8 {@XMM[6]}, [$out]!
1263 vst1.8 {@XMM[4]}, [$out]!
1264 vst1.8 {@XMM[2]}, [$out]!
1265 vst1.8 {@XMM[7]}, [$out]!
1266 b .Lcbc_dec_done
1267.align 4
1268.Lcbc_dec_five:
1269 sub $inp, $inp, #0x50
1270 bl _bsaes_decrypt8
1271 vldmia $fp, {@XMM[14]} @ reload IV
1272 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1273 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1274 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1275 veor @XMM[1], @XMM[1], @XMM[8]
1276 veor @XMM[6], @XMM[6], @XMM[9]
1277 vld1.8 {@XMM[15]}, [$inp]!
1278 veor @XMM[4], @XMM[4], @XMM[10]
1279 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1280 veor @XMM[2], @XMM[2], @XMM[11]
1281 vst1.8 {@XMM[6]}, [$out]!
1282 vst1.8 {@XMM[4]}, [$out]!
1283 vst1.8 {@XMM[2]}, [$out]!
1284 b .Lcbc_dec_done
1285.align 4
1286.Lcbc_dec_four:
1287 sub $inp, $inp, #0x40
1288 bl _bsaes_decrypt8
1289 vldmia $fp, {@XMM[14]} @ reload IV
1290 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1291 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1292 vld1.8 {@XMM[10]}, [$inp]!
1293 veor @XMM[1], @XMM[1], @XMM[8]
1294 veor @XMM[6], @XMM[6], @XMM[9]
1295 vld1.8 {@XMM[15]}, [$inp]!
1296 veor @XMM[4], @XMM[4], @XMM[10]
1297 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1298 vst1.8 {@XMM[6]}, [$out]!
1299 vst1.8 {@XMM[4]}, [$out]!
1300 b .Lcbc_dec_done
1301.align 4
1302.Lcbc_dec_three:
1303 sub $inp, $inp, #0x30
1304 bl _bsaes_decrypt8
1305 vldmia $fp, {@XMM[14]} @ reload IV
1306 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
1307 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1308 vld1.8 {@XMM[15]}, [$inp]!
1309 veor @XMM[1], @XMM[1], @XMM[8]
1310 veor @XMM[6], @XMM[6], @XMM[9]
1311 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1312 vst1.8 {@XMM[6]}, [$out]!
1313 b .Lcbc_dec_done
1314.align 4
1315.Lcbc_dec_two:
1316 sub $inp, $inp, #0x20
1317 bl _bsaes_decrypt8
1318 vldmia $fp, {@XMM[14]} @ reload IV
1319 vld1.8 {@XMM[8]}, [$inp]! @ reload input
1320 veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
1321 vld1.8 {@XMM[15]}, [$inp]! @ reload input
1322 veor @XMM[1], @XMM[1], @XMM[8]
1323 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1324 b .Lcbc_dec_done
1325.align 4
1326.Lcbc_dec_one:
1327 sub $inp, $inp, #0x10
1328 mov $rounds, $out @ save original out pointer
1329 mov $out, $fp @ use the iv scratch space as out buffer
1330 mov r2, $key
1331 vmov @XMM[4],@XMM[15] @ just in case ensure that IV
1332 vmov @XMM[5],@XMM[0] @ and input are preserved
1333 bl AES_decrypt
1334 vld1.8 {@XMM[0]}, [$fp,:64] @ load result
1335 veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
1336 vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
1337 vst1.8 {@XMM[0]}, [$rounds] @ write output
1338
1339.Lcbc_dec_done:
1340#ifndef BSAES_ASM_EXTENDED_KEY
1341 vmov.i32 q0, #0
1342 vmov.i32 q1, #0
1343.Lcbc_dec_bzero: @ wipe key schedule [if any]
1344 vstmia $keysched!, {q0-q1}
1345 cmp $keysched, $fp
1346 bne .Lcbc_dec_bzero
1347#endif
1348
1349 mov sp, $fp
1350 add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1351 vst1.8 {@XMM[15]}, [$ivp] @ return IV
1352 VFP_ABI_POP
1353 ldmia sp!, {r4-r10, pc}
1354.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1355___
1356}
1357{
1358my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
1359my $const = "r6"; # shared with _bsaes_encrypt8_alt
1360my $keysched = "sp";
1361
1362$code.=<<___;
1363.extern AES_encrypt
1364.global bsaes_ctr32_encrypt_blocks
1365.type bsaes_ctr32_encrypt_blocks,%function
1366.align 5
1367bsaes_ctr32_encrypt_blocks:
1368 cmp $len, #8 @ use plain AES for
1369 blo .Lctr_enc_short @ small sizes
1370
1371 mov ip, sp
1372 stmdb sp!, {r4-r10, lr}
1373 VFP_ABI_PUSH
1374 ldr $ctr, [ip] @ ctr is 1st arg on the stack
1375 sub sp, sp, #0x10 @ scratch space to carry over the ctr
1376 mov $fp, sp @ save sp
1377
1378 ldr $rounds, [$key, #240] @ get # of rounds
1379#ifndef BSAES_ASM_EXTENDED_KEY
1380 @ allocate the key schedule on the stack
1381 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1382 add r12, #`128-32` @ size of bit-sliced key schedule
1383
1384 @ populate the key schedule
1385 mov r4, $key @ pass key
1386 mov r5, $rounds @ pass # of rounds
1387 mov sp, r12 @ sp is $keysched
1388 bl _bsaes_key_convert
1389 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1390 vstmia r12, {@XMM[7]} @ save last round key
1391
1392 vld1.8 {@XMM[0]}, [$ctr] @ load counter
1393 add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
1394 vldmia $keysched, {@XMM[4]} @ load round0 key
1395#else
1396 ldr r12, [$key, #244]
1397 eors r12, #1
1398 beq 0f
1399
1400 @ populate the key schedule
1401 str r12, [$key, #244]
1402 mov r4, $key @ pass key
1403 mov r5, $rounds @ pass # of rounds
1404 add r12, $key, #248 @ pass key schedule
1405 bl _bsaes_key_convert
1406 veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1407 vstmia r12, {@XMM[7]} @ save last round key
1408
1409.align 2
14100: add r12, $key, #248
1411 vld1.8 {@XMM[0]}, [$ctr] @ load counter
1412 adrl $ctr, .LREVM0SR @ borrow $ctr
1413 vldmia r12, {@XMM[4]} @ load round0 key
1414 sub sp, #0x10 @ place for adjusted round0 key
1415#endif
1416
1417 vmov.i32 @XMM[8],#1 @ compose 1<<96
1418 veor @XMM[9],@XMM[9],@XMM[9]
1419 vrev32.8 @XMM[0],@XMM[0]
1420 vext.8 @XMM[8],@XMM[9],@XMM[8],#4
1421 vrev32.8 @XMM[4],@XMM[4]
1422 vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1423 vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
1424 b .Lctr_enc_loop
1425
1426.align 4
1427.Lctr_enc_loop:
1428 vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
1429 vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
1430 vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
1431 vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
1432 vadd.u32 @XMM[4], @XMM[1], @XMM[10]
1433 vadd.u32 @XMM[5], @XMM[2], @XMM[10]
1434 vadd.u32 @XMM[6], @XMM[3], @XMM[10]
1435 vadd.u32 @XMM[7], @XMM[4], @XMM[10]
1436 vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
1437
1438 @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1439 @ to flip byte order in 32-bit counter
1440
1441 vldmia $keysched, {@XMM[9]} @ load round0 key
1442#ifndef BSAES_ASM_EXTENDED_KEY
1443 add r4, $keysched, #0x10 @ pass next round key
1444#else
1445 add r4, $key, #`248+16`
1446#endif
1447 vldmia $ctr, {@XMM[8]} @ .LREVM0SR
1448 mov r5, $rounds @ pass rounds
1449 vstmia $fp, {@XMM[10]} @ save next counter
1450 sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
1451
1452 bl _bsaes_encrypt8_alt
1453
1454 subs $len, $len, #8
1455 blo .Lctr_enc_loop_done
1456
1457 vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
1458 vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1459 veor @XMM[0], @XMM[8]
1460 veor @XMM[1], @XMM[9]
1461 vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1462 veor @XMM[4], @XMM[10]
1463 veor @XMM[6], @XMM[11]
1464 vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1465 veor @XMM[3], @XMM[12]
1466 vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1467 veor @XMM[7], @XMM[13]
1468 veor @XMM[2], @XMM[14]
1469 vst1.8 {@XMM[4]}, [$out]!
1470 veor @XMM[5], @XMM[15]
1471 vst1.8 {@XMM[6]}, [$out]!
1472 vmov.i32 @XMM[8], #1 @ compose 1<<96
1473 vst1.8 {@XMM[3]}, [$out]!
1474 veor @XMM[9], @XMM[9], @XMM[9]
1475 vst1.8 {@XMM[7]}, [$out]!
1476 vext.8 @XMM[8], @XMM[9], @XMM[8], #4
1477 vst1.8 {@XMM[2]}, [$out]!
1478 vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1479 vst1.8 {@XMM[5]}, [$out]!
1480 vldmia $fp, {@XMM[0]} @ load counter
1481
1482 bne .Lctr_enc_loop
1483 b .Lctr_enc_done
1484
1485.align 4
1486.Lctr_enc_loop_done:
1487 add $len, $len, #8
1488 vld1.8 {@XMM[8]}, [$inp]! @ load input
1489 veor @XMM[0], @XMM[8]
1490 vst1.8 {@XMM[0]}, [$out]! @ write output
1491 cmp $len, #2
1492 blo .Lctr_enc_done
1493 vld1.8 {@XMM[9]}, [$inp]!
1494 veor @XMM[1], @XMM[9]
1495 vst1.8 {@XMM[1]}, [$out]!
1496 beq .Lctr_enc_done
1497 vld1.8 {@XMM[10]}, [$inp]!
1498 veor @XMM[4], @XMM[10]
1499 vst1.8 {@XMM[4]}, [$out]!
1500 cmp $len, #4
1501 blo .Lctr_enc_done
1502 vld1.8 {@XMM[11]}, [$inp]!
1503 veor @XMM[6], @XMM[11]
1504 vst1.8 {@XMM[6]}, [$out]!
1505 beq .Lctr_enc_done
1506 vld1.8 {@XMM[12]}, [$inp]!
1507 veor @XMM[3], @XMM[12]
1508 vst1.8 {@XMM[3]}, [$out]!
1509 cmp $len, #6
1510 blo .Lctr_enc_done
1511 vld1.8 {@XMM[13]}, [$inp]!
1512 veor @XMM[7], @XMM[13]
1513 vst1.8 {@XMM[7]}, [$out]!
1514 beq .Lctr_enc_done
1515 vld1.8 {@XMM[14]}, [$inp]
1516 veor @XMM[2], @XMM[14]
1517 vst1.8 {@XMM[2]}, [$out]!
1518
1519.Lctr_enc_done:
1520 vmov.i32 q0, #0
1521 vmov.i32 q1, #0
1522#ifndef BSAES_ASM_EXTENDED_KEY
1523.Lctr_enc_bzero: @ wipe key schedule [if any]
1524 vstmia $keysched!, {q0-q1}
1525 cmp $keysched, $fp
1526 bne .Lctr_enc_bzero
1527#else
1528 vstmia $keysched, {q0-q1}
1529#endif
1530
1531 mov sp, $fp
1532 add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1533 VFP_ABI_POP
1534 ldmia sp!, {r4-r10, pc} @ return
1535
1536.align 4
1537.Lctr_enc_short:
1538 ldr ip, [sp] @ ctr pointer is passed on stack
1539 stmdb sp!, {r4-r8, lr}
1540
1541 mov r4, $inp @ copy arguments
1542 mov r5, $out
1543 mov r6, $len
1544 mov r7, $key
1545 ldr r8, [ip, #12] @ load counter LSW
1546 vld1.8 {@XMM[1]}, [ip] @ load whole counter value
1547#ifdef __ARMEL__
1548 rev r8, r8
1549#endif
1550 sub sp, sp, #0x10
1551 vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
1552 sub sp, sp, #0x10
1553
1554.Lctr_enc_short_loop:
1555 add r0, sp, #0x10 @ input counter value
1556 mov r1, sp @ output on the stack
1557 mov r2, r7 @ key
1558
1559 bl AES_encrypt
1560
1561 vld1.8 {@XMM[0]}, [r4]! @ load input
1562 vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
1563 add r8, r8, #1
1564#ifdef __ARMEL__
1565 rev r0, r8
1566 str r0, [sp, #0x1c] @ next counter value
1567#else
1568 str r8, [sp, #0x1c] @ next counter value
1569#endif
1570 veor @XMM[0],@XMM[0],@XMM[1]
1571 vst1.8 {@XMM[0]}, [r5]! @ store output
1572 subs r6, r6, #1
1573 bne .Lctr_enc_short_loop
1574
1575 vmov.i32 q0, #0
1576 vmov.i32 q1, #0
1577 vstmia sp!, {q0-q1}
1578
1579 ldmia sp!, {r4-r8, pc}
1580.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1581___
1582}
1583{
1584######################################################################
1585# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1586# const AES_KEY *key1, const AES_KEY *key2,
1587# const unsigned char iv[16]);
1588#
1589my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
1590my $const="r6"; # returned by _bsaes_key_convert
1591my $twmask=@XMM[5];
1592my @T=@XMM[6..7];
1593
1594$code.=<<___;
1595.globl bsaes_xts_encrypt
1596.type bsaes_xts_encrypt,%function
1597.align 4
1598bsaes_xts_encrypt:
1599 mov ip, sp
1600 stmdb sp!, {r4-r10, lr} @ 0x20
1601 VFP_ABI_PUSH
1602 mov r6, sp @ future $fp
1603
1604 mov $inp, r0
1605 mov $out, r1
1606 mov $len, r2
1607 mov $key, r3
1608
1609 sub r0, sp, #0x10 @ 0x10
1610 bic r0, #0xf @ align at 16 bytes
1611 mov sp, r0
1612
1613#ifdef XTS_CHAIN_TWEAK
1614 ldr r0, [ip] @ pointer to input tweak
1615#else
1616 @ generate initial tweak
1617 ldr r0, [ip, #4] @ iv[]
1618 mov r1, sp
1619 ldr r2, [ip, #0] @ key2
1620 bl AES_encrypt
1621 mov r0,sp @ pointer to initial tweak
1622#endif
1623
1624 ldr $rounds, [$key, #240] @ get # of rounds
1625 mov $fp, r6
1626#ifndef BSAES_ASM_EXTENDED_KEY
1627 @ allocate the key schedule on the stack
1628 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1629 @ add r12, #`128-32` @ size of bit-sliced key schedule
1630 sub r12, #`32+16` @ place for tweak[9]
1631
1632 @ populate the key schedule
1633 mov r4, $key @ pass key
1634 mov r5, $rounds @ pass # of rounds
1635 mov sp, r12
1636 add r12, #0x90 @ pass key schedule
1637 bl _bsaes_key_convert
1638 veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1639 vstmia r12, {@XMM[7]} @ save last round key
1640#else
1641 ldr r12, [$key, #244]
1642 eors r12, #1
1643 beq 0f
1644
1645 str r12, [$key, #244]
1646 mov r4, $key @ pass key
1647 mov r5, $rounds @ pass # of rounds
1648 add r12, $key, #248 @ pass key schedule
1649 bl _bsaes_key_convert
1650 veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
1651 vstmia r12, {@XMM[7]}
1652
1653.align 2
16540: sub sp, #0x90 @ place for tweak[9]
1655#endif
1656
1657 vld1.8 {@XMM[8]}, [r0] @ initial tweak
1658 adr $magic, .Lxts_magic
1659
1660 subs $len, #0x80
1661 blo .Lxts_enc_short
1662 b .Lxts_enc_loop
1663
1664.align 4
1665.Lxts_enc_loop:
1666 vldmia $magic, {$twmask} @ load XTS magic
1667 vshr.s64 @T[0], @XMM[8], #63
1668 mov r0, sp
1669 vand @T[0], @T[0], $twmask
1670___
1671for($i=9;$i<16;$i++) {
1672$code.=<<___;
1673 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1674 vst1.64 {@XMM[$i-1]}, [r0,:128]!
1675 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1676 vshr.s64 @T[1], @XMM[$i], #63
1677 veor @XMM[$i], @XMM[$i], @T[0]
1678 vand @T[1], @T[1], $twmask
1679___
1680 @T=reverse(@T);
1681
1682$code.=<<___ if ($i>=10);
1683 vld1.8 {@XMM[$i-10]}, [$inp]!
1684___
1685$code.=<<___ if ($i>=11);
1686 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1687___
1688}
1689$code.=<<___;
1690 vadd.u64 @XMM[8], @XMM[15], @XMM[15]
1691 vst1.64 {@XMM[15]}, [r0,:128]!
1692 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1693 veor @XMM[8], @XMM[8], @T[0]
1694 vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1695
1696 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1697 veor @XMM[5], @XMM[5], @XMM[13]
1698#ifndef BSAES_ASM_EXTENDED_KEY
1699 add r4, sp, #0x90 @ pass key schedule
1700#else
1701 add r4, $key, #248 @ pass key schedule
1702#endif
1703 veor @XMM[6], @XMM[6], @XMM[14]
1704 mov r5, $rounds @ pass rounds
1705 veor @XMM[7], @XMM[7], @XMM[15]
1706 mov r0, sp
1707
1708 bl _bsaes_encrypt8
1709
1710 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1711 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1712 veor @XMM[0], @XMM[0], @XMM[ 8]
1713 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1714 veor @XMM[1], @XMM[1], @XMM[ 9]
1715 veor @XMM[8], @XMM[4], @XMM[10]
1716 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1717 veor @XMM[9], @XMM[6], @XMM[11]
1718 vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
1719 veor @XMM[10], @XMM[3], @XMM[12]
1720 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1721 veor @XMM[11], @XMM[7], @XMM[13]
1722 veor @XMM[12], @XMM[2], @XMM[14]
1723 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1724 veor @XMM[13], @XMM[5], @XMM[15]
1725 vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
1726
1727 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1728
1729 subs $len, #0x80
1730 bpl .Lxts_enc_loop
1731
1732.Lxts_enc_short:
1733 adds $len, #0x70
1734 bmi .Lxts_enc_done
1735
1736 vldmia $magic, {$twmask} @ load XTS magic
1737 vshr.s64 @T[0], @XMM[8], #63
1738 mov r0, sp
1739 vand @T[0], @T[0], $twmask
1740___
1741for($i=9;$i<16;$i++) {
1742$code.=<<___;
1743 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
1744 vst1.64 {@XMM[$i-1]}, [r0,:128]!
1745 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
1746 vshr.s64 @T[1], @XMM[$i], #63
1747 veor @XMM[$i], @XMM[$i], @T[0]
1748 vand @T[1], @T[1], $twmask
1749___
1750 @T=reverse(@T);
1751
1752$code.=<<___ if ($i>=10);
1753 vld1.8 {@XMM[$i-10]}, [$inp]!
1754 subs $len, #0x10
1755 bmi .Lxts_enc_`$i-9`
1756___
1757$code.=<<___ if ($i>=11);
1758 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
1759___
1760}
1761$code.=<<___;
1762 sub $len, #0x10
1763 vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
1764
1765 vld1.8 {@XMM[6]}, [$inp]!
1766 veor @XMM[5], @XMM[5], @XMM[13]
1767#ifndef BSAES_ASM_EXTENDED_KEY
1768 add r4, sp, #0x90 @ pass key schedule
1769#else
1770 add r4, $key, #248 @ pass key schedule
1771#endif
1772 veor @XMM[6], @XMM[6], @XMM[14]
1773 mov r5, $rounds @ pass rounds
1774 mov r0, sp
1775
1776 bl _bsaes_encrypt8
1777
1778 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1779 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1780 veor @XMM[0], @XMM[0], @XMM[ 8]
1781 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1782 veor @XMM[1], @XMM[1], @XMM[ 9]
1783 veor @XMM[8], @XMM[4], @XMM[10]
1784 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1785 veor @XMM[9], @XMM[6], @XMM[11]
1786 vld1.64 {@XMM[14]}, [r0,:128]!
1787 veor @XMM[10], @XMM[3], @XMM[12]
1788 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1789 veor @XMM[11], @XMM[7], @XMM[13]
1790 veor @XMM[12], @XMM[2], @XMM[14]
1791 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1792 vst1.8 {@XMM[12]}, [$out]!
1793
1794 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1795 b .Lxts_enc_done
1796.align 4
1797.Lxts_enc_6:
1798 vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
1799
1800 veor @XMM[4], @XMM[4], @XMM[12]
1801#ifndef BSAES_ASM_EXTENDED_KEY
1802 add r4, sp, #0x90 @ pass key schedule
1803#else
1804 add r4, $key, #248 @ pass key schedule
1805#endif
1806 veor @XMM[5], @XMM[5], @XMM[13]
1807 mov r5, $rounds @ pass rounds
1808 mov r0, sp
1809
1810 bl _bsaes_encrypt8
1811
1812 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1813 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1814 veor @XMM[0], @XMM[0], @XMM[ 8]
1815 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
1816 veor @XMM[1], @XMM[1], @XMM[ 9]
1817 veor @XMM[8], @XMM[4], @XMM[10]
1818 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1819 veor @XMM[9], @XMM[6], @XMM[11]
1820 veor @XMM[10], @XMM[3], @XMM[12]
1821 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1822 veor @XMM[11], @XMM[7], @XMM[13]
1823 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
1824
1825 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1826 b .Lxts_enc_done
1827
1828@ put this in range for both ARM and Thumb mode adr instructions
1829.align 5
1830.Lxts_magic:
1831 .quad 1, 0x87
1832
1833.align 5
1834.Lxts_enc_5:
1835 vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
1836
1837 veor @XMM[3], @XMM[3], @XMM[11]
1838#ifndef BSAES_ASM_EXTENDED_KEY
1839 add r4, sp, #0x90 @ pass key schedule
1840#else
1841 add r4, $key, #248 @ pass key schedule
1842#endif
1843 veor @XMM[4], @XMM[4], @XMM[12]
1844 mov r5, $rounds @ pass rounds
1845 mov r0, sp
1846
1847 bl _bsaes_encrypt8
1848
1849 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1850 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1851 veor @XMM[0], @XMM[0], @XMM[ 8]
1852 vld1.64 {@XMM[12]}, [r0,:128]!
1853 veor @XMM[1], @XMM[1], @XMM[ 9]
1854 veor @XMM[8], @XMM[4], @XMM[10]
1855 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1856 veor @XMM[9], @XMM[6], @XMM[11]
1857 veor @XMM[10], @XMM[3], @XMM[12]
1858 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1859 vst1.8 {@XMM[10]}, [$out]!
1860
1861 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1862 b .Lxts_enc_done
1863.align 4
1864.Lxts_enc_4:
1865 vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
1866
1867 veor @XMM[2], @XMM[2], @XMM[10]
1868#ifndef BSAES_ASM_EXTENDED_KEY
1869 add r4, sp, #0x90 @ pass key schedule
1870#else
1871 add r4, $key, #248 @ pass key schedule
1872#endif
1873 veor @XMM[3], @XMM[3], @XMM[11]
1874 mov r5, $rounds @ pass rounds
1875 mov r0, sp
1876
1877 bl _bsaes_encrypt8
1878
1879 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
1880 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
1881 veor @XMM[0], @XMM[0], @XMM[ 8]
1882 veor @XMM[1], @XMM[1], @XMM[ 9]
1883 veor @XMM[8], @XMM[4], @XMM[10]
1884 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1885 veor @XMM[9], @XMM[6], @XMM[11]
1886 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
1887
1888 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1889 b .Lxts_enc_done
1890.align 4
1891.Lxts_enc_3:
1892 vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
1893
1894 veor @XMM[1], @XMM[1], @XMM[9]
1895#ifndef BSAES_ASM_EXTENDED_KEY
1896 add r4, sp, #0x90 @ pass key schedule
1897#else
1898 add r4, $key, #248 @ pass key schedule
1899#endif
1900 veor @XMM[2], @XMM[2], @XMM[10]
1901 mov r5, $rounds @ pass rounds
1902 mov r0, sp
1903
1904 bl _bsaes_encrypt8
1905
1906 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
1907 vld1.64 {@XMM[10]}, [r0,:128]!
1908 veor @XMM[0], @XMM[0], @XMM[ 8]
1909 veor @XMM[1], @XMM[1], @XMM[ 9]
1910 veor @XMM[8], @XMM[4], @XMM[10]
1911 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1912 vst1.8 {@XMM[8]}, [$out]!
1913
1914 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1915 b .Lxts_enc_done
1916.align 4
1917.Lxts_enc_2:
1918 vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
1919
1920 veor @XMM[0], @XMM[0], @XMM[8]
1921#ifndef BSAES_ASM_EXTENDED_KEY
1922 add r4, sp, #0x90 @ pass key schedule
1923#else
1924 add r4, $key, #248 @ pass key schedule
1925#endif
1926 veor @XMM[1], @XMM[1], @XMM[9]
1927 mov r5, $rounds @ pass rounds
1928 mov r0, sp
1929
1930 bl _bsaes_encrypt8
1931
1932 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
1933 veor @XMM[0], @XMM[0], @XMM[ 8]
1934 veor @XMM[1], @XMM[1], @XMM[ 9]
1935 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
1936
1937 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
1938 b .Lxts_enc_done
1939.align 4
1940.Lxts_enc_1:
1941 mov r0, sp
1942 veor @XMM[0], @XMM[8]
1943 mov r1, sp
1944 vst1.8 {@XMM[0]}, [sp,:128]
1945 mov r2, $key
1946 mov r4, $fp @ preserve fp
1947
1948 bl AES_encrypt
1949
1950 vld1.8 {@XMM[0]}, [sp,:128]
1951 veor @XMM[0], @XMM[0], @XMM[8]
1952 vst1.8 {@XMM[0]}, [$out]!
1953 mov $fp, r4
1954
1955 vmov @XMM[8], @XMM[9] @ next round tweak
1956
1957.Lxts_enc_done:
1958#ifndef XTS_CHAIN_TWEAK
1959 adds $len, #0x10
1960 beq .Lxts_enc_ret
1961 sub r6, $out, #0x10
1962
1963.Lxts_enc_steal:
1964 ldrb r0, [$inp], #1
1965 ldrb r1, [$out, #-0x10]
1966 strb r0, [$out, #-0x10]
1967 strb r1, [$out], #1
1968
1969 subs $len, #1
1970 bhi .Lxts_enc_steal
1971
1972 vld1.8 {@XMM[0]}, [r6]
1973 mov r0, sp
1974 veor @XMM[0], @XMM[0], @XMM[8]
1975 mov r1, sp
1976 vst1.8 {@XMM[0]}, [sp,:128]
1977 mov r2, $key
1978 mov r4, $fp @ preserve fp
1979
1980 bl AES_encrypt
1981
1982 vld1.8 {@XMM[0]}, [sp,:128]
1983 veor @XMM[0], @XMM[0], @XMM[8]
1984 vst1.8 {@XMM[0]}, [r6]
1985 mov $fp, r4
1986#endif
1987
1988.Lxts_enc_ret:
1989 bic r0, $fp, #0xf
1990 vmov.i32 q0, #0
1991 vmov.i32 q1, #0
1992#ifdef XTS_CHAIN_TWEAK
1993 ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
1994#endif
1995.Lxts_enc_bzero: @ wipe key schedule [if any]
1996 vstmia sp!, {q0-q1}
1997 cmp sp, r0
1998 bne .Lxts_enc_bzero
1999
2000 mov sp, $fp
2001#ifdef XTS_CHAIN_TWEAK
2002 vst1.8 {@XMM[8]}, [r1]
2003#endif
2004 VFP_ABI_POP
2005 ldmia sp!, {r4-r10, pc} @ return
2006
2007.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2008
2009.globl bsaes_xts_decrypt
2010.type bsaes_xts_decrypt,%function
2011.align 4
2012bsaes_xts_decrypt:
2013 mov ip, sp
2014 stmdb sp!, {r4-r10, lr} @ 0x20
2015 VFP_ABI_PUSH
2016 mov r6, sp @ future $fp
2017
2018 mov $inp, r0
2019 mov $out, r1
2020 mov $len, r2
2021 mov $key, r3
2022
2023 sub r0, sp, #0x10 @ 0x10
2024 bic r0, #0xf @ align at 16 bytes
2025 mov sp, r0
2026
2027#ifdef XTS_CHAIN_TWEAK
2028 ldr r0, [ip] @ pointer to input tweak
2029#else
2030 @ generate initial tweak
2031 ldr r0, [ip, #4] @ iv[]
2032 mov r1, sp
2033 ldr r2, [ip, #0] @ key2
2034 bl AES_encrypt
2035 mov r0, sp @ pointer to initial tweak
2036#endif
2037
2038 ldr $rounds, [$key, #240] @ get # of rounds
2039 mov $fp, r6
2040#ifndef BSAES_ASM_EXTENDED_KEY
2041 @ allocate the key schedule on the stack
2042 sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
2043 @ add r12, #`128-32` @ size of bit-sliced key schedule
2044 sub r12, #`32+16` @ place for tweak[9]
2045
2046 @ populate the key schedule
2047 mov r4, $key @ pass key
2048 mov r5, $rounds @ pass # of rounds
2049 mov sp, r12
2050 add r12, #0x90 @ pass key schedule
2051 bl _bsaes_key_convert
2052 add r4, sp, #0x90
2053 vldmia r4, {@XMM[6]}
2054 vstmia r12, {@XMM[15]} @ save last round key
2055 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
2056 vstmia r4, {@XMM[7]}
2057#else
2058 ldr r12, [$key, #244]
2059 eors r12, #1
2060 beq 0f
2061
2062 str r12, [$key, #244]
2063 mov r4, $key @ pass key
2064 mov r5, $rounds @ pass # of rounds
2065 add r12, $key, #248 @ pass key schedule
2066 bl _bsaes_key_convert
2067 add r4, $key, #248
2068 vldmia r4, {@XMM[6]}
2069 vstmia r12, {@XMM[15]} @ save last round key
2070 veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
2071 vstmia r4, {@XMM[7]}
2072
2073.align 2
20740: sub sp, #0x90 @ place for tweak[9]
2075#endif
2076 vld1.8 {@XMM[8]}, [r0] @ initial tweak
2077 adr $magic, .Lxts_magic
2078
2079 tst $len, #0xf @ if not multiple of 16
2080 it ne @ Thumb2 thing, sanity check in ARM
2081 subne $len, #0x10 @ subtract another 16 bytes
2082 subs $len, #0x80
2083
2084 blo .Lxts_dec_short
2085 b .Lxts_dec_loop
2086
2087.align 4
2088.Lxts_dec_loop:
2089 vldmia $magic, {$twmask} @ load XTS magic
2090 vshr.s64 @T[0], @XMM[8], #63
2091 mov r0, sp
2092 vand @T[0], @T[0], $twmask
2093___
2094for($i=9;$i<16;$i++) {
2095$code.=<<___;
2096 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2097 vst1.64 {@XMM[$i-1]}, [r0,:128]!
2098 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2099 vshr.s64 @T[1], @XMM[$i], #63
2100 veor @XMM[$i], @XMM[$i], @T[0]
2101 vand @T[1], @T[1], $twmask
2102___
2103 @T=reverse(@T);
2104
2105$code.=<<___ if ($i>=10);
2106 vld1.8 {@XMM[$i-10]}, [$inp]!
2107___
2108$code.=<<___ if ($i>=11);
2109 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2110___
2111}
2112$code.=<<___;
2113 vadd.u64 @XMM[8], @XMM[15], @XMM[15]
2114 vst1.64 {@XMM[15]}, [r0,:128]!
2115 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2116 veor @XMM[8], @XMM[8], @T[0]
2117 vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2118
2119 vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
2120 veor @XMM[5], @XMM[5], @XMM[13]
2121#ifndef BSAES_ASM_EXTENDED_KEY
2122 add r4, sp, #0x90 @ pass key schedule
2123#else
2124 add r4, $key, #248 @ pass key schedule
2125#endif
2126 veor @XMM[6], @XMM[6], @XMM[14]
2127 mov r5, $rounds @ pass rounds
2128 veor @XMM[7], @XMM[7], @XMM[15]
2129 mov r0, sp
2130
2131 bl _bsaes_decrypt8
2132
2133 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2134 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2135 veor @XMM[0], @XMM[0], @XMM[ 8]
2136 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2137 veor @XMM[1], @XMM[1], @XMM[ 9]
2138 veor @XMM[8], @XMM[6], @XMM[10]
2139 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2140 veor @XMM[9], @XMM[4], @XMM[11]
2141 vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
2142 veor @XMM[10], @XMM[2], @XMM[12]
2143 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2144 veor @XMM[11], @XMM[7], @XMM[13]
2145 veor @XMM[12], @XMM[3], @XMM[14]
2146 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2147 veor @XMM[13], @XMM[5], @XMM[15]
2148 vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
2149
2150 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2151
2152 subs $len, #0x80
2153 bpl .Lxts_dec_loop
2154
2155.Lxts_dec_short:
2156 adds $len, #0x70
2157 bmi .Lxts_dec_done
2158
2159 vldmia $magic, {$twmask} @ load XTS magic
2160 vshr.s64 @T[0], @XMM[8], #63
2161 mov r0, sp
2162 vand @T[0], @T[0], $twmask
2163___
2164for($i=9;$i<16;$i++) {
2165$code.=<<___;
2166 vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
2167 vst1.64 {@XMM[$i-1]}, [r0,:128]!
2168 vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
2169 vshr.s64 @T[1], @XMM[$i], #63
2170 veor @XMM[$i], @XMM[$i], @T[0]
2171 vand @T[1], @T[1], $twmask
2172___
2173 @T=reverse(@T);
2174
2175$code.=<<___ if ($i>=10);
2176 vld1.8 {@XMM[$i-10]}, [$inp]!
2177 subs $len, #0x10
2178 bmi .Lxts_dec_`$i-9`
2179___
2180$code.=<<___ if ($i>=11);
2181 veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
2182___
2183}
2184$code.=<<___;
2185 sub $len, #0x10
2186 vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
2187
2188 vld1.8 {@XMM[6]}, [$inp]!
2189 veor @XMM[5], @XMM[5], @XMM[13]
2190#ifndef BSAES_ASM_EXTENDED_KEY
2191 add r4, sp, #0x90 @ pass key schedule
2192#else
2193 add r4, $key, #248 @ pass key schedule
2194#endif
2195 veor @XMM[6], @XMM[6], @XMM[14]
2196 mov r5, $rounds @ pass rounds
2197 mov r0, sp
2198
2199 bl _bsaes_decrypt8
2200
2201 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2202 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2203 veor @XMM[0], @XMM[0], @XMM[ 8]
2204 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2205 veor @XMM[1], @XMM[1], @XMM[ 9]
2206 veor @XMM[8], @XMM[6], @XMM[10]
2207 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2208 veor @XMM[9], @XMM[4], @XMM[11]
2209 vld1.64 {@XMM[14]}, [r0,:128]!
2210 veor @XMM[10], @XMM[2], @XMM[12]
2211 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2212 veor @XMM[11], @XMM[7], @XMM[13]
2213 veor @XMM[12], @XMM[3], @XMM[14]
2214 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2215 vst1.8 {@XMM[12]}, [$out]!
2216
2217 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2218 b .Lxts_dec_done
2219.align 4
2220.Lxts_dec_6:
2221 vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
2222
2223 veor @XMM[4], @XMM[4], @XMM[12]
2224#ifndef BSAES_ASM_EXTENDED_KEY
2225 add r4, sp, #0x90 @ pass key schedule
2226#else
2227 add r4, $key, #248 @ pass key schedule
2228#endif
2229 veor @XMM[5], @XMM[5], @XMM[13]
2230 mov r5, $rounds @ pass rounds
2231 mov r0, sp
2232
2233 bl _bsaes_decrypt8
2234
2235 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2236 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2237 veor @XMM[0], @XMM[0], @XMM[ 8]
2238 vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
2239 veor @XMM[1], @XMM[1], @XMM[ 9]
2240 veor @XMM[8], @XMM[6], @XMM[10]
2241 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2242 veor @XMM[9], @XMM[4], @XMM[11]
2243 veor @XMM[10], @XMM[2], @XMM[12]
2244 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2245 veor @XMM[11], @XMM[7], @XMM[13]
2246 vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
2247
2248 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2249 b .Lxts_dec_done
2250.align 4
2251.Lxts_dec_5:
2252 vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
2253
2254 veor @XMM[3], @XMM[3], @XMM[11]
2255#ifndef BSAES_ASM_EXTENDED_KEY
2256 add r4, sp, #0x90 @ pass key schedule
2257#else
2258 add r4, $key, #248 @ pass key schedule
2259#endif
2260 veor @XMM[4], @XMM[4], @XMM[12]
2261 mov r5, $rounds @ pass rounds
2262 mov r0, sp
2263
2264 bl _bsaes_decrypt8
2265
2266 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2267 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2268 veor @XMM[0], @XMM[0], @XMM[ 8]
2269 vld1.64 {@XMM[12]}, [r0,:128]!
2270 veor @XMM[1], @XMM[1], @XMM[ 9]
2271 veor @XMM[8], @XMM[6], @XMM[10]
2272 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2273 veor @XMM[9], @XMM[4], @XMM[11]
2274 veor @XMM[10], @XMM[2], @XMM[12]
2275 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2276 vst1.8 {@XMM[10]}, [$out]!
2277
2278 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2279 b .Lxts_dec_done
2280.align 4
2281.Lxts_dec_4:
2282 vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
2283
2284 veor @XMM[2], @XMM[2], @XMM[10]
2285#ifndef BSAES_ASM_EXTENDED_KEY
2286 add r4, sp, #0x90 @ pass key schedule
2287#else
2288 add r4, $key, #248 @ pass key schedule
2289#endif
2290 veor @XMM[3], @XMM[3], @XMM[11]
2291 mov r5, $rounds @ pass rounds
2292 mov r0, sp
2293
2294 bl _bsaes_decrypt8
2295
2296 vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
2297 vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
2298 veor @XMM[0], @XMM[0], @XMM[ 8]
2299 veor @XMM[1], @XMM[1], @XMM[ 9]
2300 veor @XMM[8], @XMM[6], @XMM[10]
2301 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2302 veor @XMM[9], @XMM[4], @XMM[11]
2303 vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
2304
2305 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2306 b .Lxts_dec_done
2307.align 4
2308.Lxts_dec_3:
2309 vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
2310
2311 veor @XMM[1], @XMM[1], @XMM[9]
2312#ifndef BSAES_ASM_EXTENDED_KEY
2313 add r4, sp, #0x90 @ pass key schedule
2314#else
2315 add r4, $key, #248 @ pass key schedule
2316#endif
2317 veor @XMM[2], @XMM[2], @XMM[10]
2318 mov r5, $rounds @ pass rounds
2319 mov r0, sp
2320
2321 bl _bsaes_decrypt8
2322
2323 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
2324 vld1.64 {@XMM[10]}, [r0,:128]!
2325 veor @XMM[0], @XMM[0], @XMM[ 8]
2326 veor @XMM[1], @XMM[1], @XMM[ 9]
2327 veor @XMM[8], @XMM[6], @XMM[10]
2328 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2329 vst1.8 {@XMM[8]}, [$out]!
2330
2331 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2332 b .Lxts_dec_done
2333.align 4
2334.Lxts_dec_2:
2335 vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
2336
2337 veor @XMM[0], @XMM[0], @XMM[8]
2338#ifndef BSAES_ASM_EXTENDED_KEY
2339 add r4, sp, #0x90 @ pass key schedule
2340#else
2341 add r4, $key, #248 @ pass key schedule
2342#endif
2343 veor @XMM[1], @XMM[1], @XMM[9]
2344 mov r5, $rounds @ pass rounds
2345 mov r0, sp
2346
2347 bl _bsaes_decrypt8
2348
2349 vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
2350 veor @XMM[0], @XMM[0], @XMM[ 8]
2351 veor @XMM[1], @XMM[1], @XMM[ 9]
2352 vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
2353
2354 vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
2355 b .Lxts_dec_done
2356.align 4
2357.Lxts_dec_1:
2358 mov r0, sp
2359 veor @XMM[0], @XMM[8]
2360 mov r1, sp
2361 vst1.8 {@XMM[0]}, [sp,:128]
2362 mov r2, $key
2363 mov r4, $fp @ preserve fp
2364 mov r5, $magic @ preserve magic
2365
2366 bl AES_decrypt
2367
2368 vld1.8 {@XMM[0]}, [sp,:128]
2369 veor @XMM[0], @XMM[0], @XMM[8]
2370 vst1.8 {@XMM[0]}, [$out]!
2371 mov $fp, r4
2372 mov $magic, r5
2373
2374 vmov @XMM[8], @XMM[9] @ next round tweak
2375
2376.Lxts_dec_done:
2377#ifndef XTS_CHAIN_TWEAK
2378 adds $len, #0x10
2379 beq .Lxts_dec_ret
2380
2381 @ calculate one round of extra tweak for the stolen ciphertext
2382 vldmia $magic, {$twmask}
2383 vshr.s64 @XMM[6], @XMM[8], #63
2384 vand @XMM[6], @XMM[6], $twmask
2385 vadd.u64 @XMM[9], @XMM[8], @XMM[8]
2386 vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
2387 veor @XMM[9], @XMM[9], @XMM[6]
2388
2389 @ perform the final decryption with the last tweak value
2390 vld1.8 {@XMM[0]}, [$inp]!
2391 mov r0, sp
2392 veor @XMM[0], @XMM[0], @XMM[9]
2393 mov r1, sp
2394 vst1.8 {@XMM[0]}, [sp,:128]
2395 mov r2, $key
2396 mov r4, $fp @ preserve fp
2397
2398 bl AES_decrypt
2399
2400 vld1.8 {@XMM[0]}, [sp,:128]
2401 veor @XMM[0], @XMM[0], @XMM[9]
2402 vst1.8 {@XMM[0]}, [$out]
2403
2404 mov r6, $out
2405.Lxts_dec_steal:
2406 ldrb r1, [$out]
2407 ldrb r0, [$inp], #1
2408 strb r1, [$out, #0x10]
2409 strb r0, [$out], #1
2410
2411 subs $len, #1
2412 bhi .Lxts_dec_steal
2413
2414 vld1.8 {@XMM[0]}, [r6]
2415 mov r0, sp
2416 veor @XMM[0], @XMM[8]
2417 mov r1, sp
2418 vst1.8 {@XMM[0]}, [sp,:128]
2419 mov r2, $key
2420
2421 bl AES_decrypt
2422
2423 vld1.8 {@XMM[0]}, [sp,:128]
2424 veor @XMM[0], @XMM[0], @XMM[8]
2425 vst1.8 {@XMM[0]}, [r6]
2426 mov $fp, r4
2427#endif
2428
2429.Lxts_dec_ret:
2430 bic r0, $fp, #0xf
2431 vmov.i32 q0, #0
2432 vmov.i32 q1, #0
2433#ifdef XTS_CHAIN_TWEAK
2434 ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
2435#endif
2436.Lxts_dec_bzero: @ wipe key schedule [if any]
2437 vstmia sp!, {q0-q1}
2438 cmp sp, r0
2439 bne .Lxts_dec_bzero
2440
2441 mov sp, $fp
2442#ifdef XTS_CHAIN_TWEAK
2443 vst1.8 {@XMM[8]}, [r1]
2444#endif
2445 VFP_ABI_POP
2446 ldmia sp!, {r4-r10, pc} @ return
2447
2448.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2449___
2450}
2451$code.=<<___;
2452#endif
2453___
2454
2455$code =~ s/\`([^\`]*)\`/eval($1)/gem;
2456
2457open SELF,$0;
2458while(<SELF>) {
2459 next if (/^#!/);
2460 last if (!s/^#/@/ and !/^$/);
2461 print;
2462}
2463close SELF;
2464
2465print $code;
2466
2467close STDOUT;
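
For orientation, the only C prototype spelled out in this file is the bsaes_xts_[en|de]crypt one quoted in the comment above; the sketch below shows what a caller might look like under that assumed prototype. The key-setup calls and all names are illustrative (in the kernel this job is done by the glue code added elsewhere in this series), so treat it as a sketch, not as the glue implementation.

/*
 * Hypothetical user-space-style caller; the prototype is taken from the
 * bsaes_xts_[en|de]crypt comment in this file, everything else is made up.
 */
#include <stddef.h>
#include <openssl/aes.h>        /* AES_KEY, AES_set_encrypt_key() */

void bsaes_xts_encrypt(const char *inp, char *out, size_t len,
                       const AES_KEY *key1, const AES_KEY *key2,
                       const unsigned char iv[16]);

static void xts_encrypt_unit(const unsigned char *k1, const unsigned char *k2,
                             const char *in, char *out, size_t len,
                             unsigned char iv[16])
{
        AES_KEY data_key, tweak_key;

        AES_set_encrypt_key(k1, 128, &data_key);        /* key1 encrypts the data  */
        AES_set_encrypt_key(k2, 128, &tweak_key);       /* key2 encrypts the tweak */
        bsaes_xts_encrypt(in, out, len, &data_key, &tweak_key, iv);
}
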
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 1a7024b41351..c38b58c80202 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += sembuf.h
24generic-y += serial.h 24generic-y += serial.h
25generic-y += shmbuf.h 25generic-y += shmbuf.h
26generic-y += siginfo.h 26generic-y += siginfo.h
27generic-y += simd.h
27generic-y += sizes.h 28generic-y += sizes.h
28generic-y += socket.h 29generic-y += socket.h
29generic-y += sockios.h 30generic-y += sockios.h
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index fcc1b5bf6979..5c2285160575 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -53,6 +53,13 @@
53#define put_byte_3 lsl #0 53#define put_byte_3 lsl #0
54#endif 54#endif
55 55
56/* Select code for any configuration running in BE8 mode */
57#ifdef CONFIG_CPU_ENDIAN_BE8
58#define ARM_BE8(code...) code
59#else
60#define ARM_BE8(code...)
61#endif
62
56/* 63/*
57 * Data preload for architectures that support it 64 * Data preload for architectures that support it
58 */ 65 */
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index da1c77d39327..62d2cb53b069 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -12,6 +12,7 @@
12#define __ASM_ARM_ATOMIC_H 12#define __ASM_ARM_ATOMIC_H
13 13
14#include <linux/compiler.h> 14#include <linux/compiler.h>
15#include <linux/prefetch.h>
15#include <linux/types.h> 16#include <linux/types.h>
16#include <linux/irqflags.h> 17#include <linux/irqflags.h>
17#include <asm/barrier.h> 18#include <asm/barrier.h>
@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v)
41 unsigned long tmp; 42 unsigned long tmp;
42 int result; 43 int result;
43 44
45 prefetchw(&v->counter);
44 __asm__ __volatile__("@ atomic_add\n" 46 __asm__ __volatile__("@ atomic_add\n"
45"1: ldrex %0, [%3]\n" 47"1: ldrex %0, [%3]\n"
46" add %0, %0, %4\n" 48" add %0, %0, %4\n"
@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v)
79 unsigned long tmp; 81 unsigned long tmp;
80 int result; 82 int result;
81 83
84 prefetchw(&v->counter);
82 __asm__ __volatile__("@ atomic_sub\n" 85 __asm__ __volatile__("@ atomic_sub\n"
83"1: ldrex %0, [%3]\n" 86"1: ldrex %0, [%3]\n"
84" sub %0, %0, %4\n" 87" sub %0, %0, %4\n"
@@ -114,7 +117,8 @@ static inline int atomic_sub_return(int i, atomic_t *v)
114 117
115static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) 118static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
116{ 119{
117 unsigned long oldval, res; 120 int oldval;
121 unsigned long res;
118 122
119 smp_mb(); 123 smp_mb();
120 124
@@ -134,21 +138,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
134 return oldval; 138 return oldval;
135} 139}
136 140
137static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
138{
139 unsigned long tmp, tmp2;
140
141 __asm__ __volatile__("@ atomic_clear_mask\n"
142"1: ldrex %0, [%3]\n"
143" bic %0, %0, %4\n"
144" strex %1, %0, [%3]\n"
145" teq %1, #0\n"
146" bne 1b"
147 : "=&r" (tmp), "=&r" (tmp2), "+Qo" (*addr)
148 : "r" (addr), "Ir" (mask)
149 : "cc");
150}
151
152#else /* ARM_ARCH_6 */ 141#else /* ARM_ARCH_6 */
153 142
154#ifdef CONFIG_SMP 143#ifdef CONFIG_SMP
@@ -197,15 +186,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
197 return ret; 186 return ret;
198} 187}
199 188
200static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
201{
202 unsigned long flags;
203
204 raw_local_irq_save(flags);
205 *addr &= ~mask;
206 raw_local_irq_restore(flags);
207}
208
209#endif /* __LINUX_ARM_ARCH__ */ 189#endif /* __LINUX_ARM_ARCH__ */
210 190
211#define atomic_xchg(v, new) (xchg(&((v)->counter), new)) 191#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
@@ -238,15 +218,15 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
238 218
239#ifndef CONFIG_GENERIC_ATOMIC64 219#ifndef CONFIG_GENERIC_ATOMIC64
240typedef struct { 220typedef struct {
241 u64 __aligned(8) counter; 221 long long counter;
242} atomic64_t; 222} atomic64_t;
243 223
244#define ATOMIC64_INIT(i) { (i) } 224#define ATOMIC64_INIT(i) { (i) }
245 225
246#ifdef CONFIG_ARM_LPAE 226#ifdef CONFIG_ARM_LPAE
247static inline u64 atomic64_read(const atomic64_t *v) 227static inline long long atomic64_read(const atomic64_t *v)
248{ 228{
249 u64 result; 229 long long result;
250 230
251 __asm__ __volatile__("@ atomic64_read\n" 231 __asm__ __volatile__("@ atomic64_read\n"
252" ldrd %0, %H0, [%1]" 232" ldrd %0, %H0, [%1]"
@@ -257,7 +237,7 @@ static inline u64 atomic64_read(const atomic64_t *v)
257 return result; 237 return result;
258} 238}
259 239
260static inline void atomic64_set(atomic64_t *v, u64 i) 240static inline void atomic64_set(atomic64_t *v, long long i)
261{ 241{
262 __asm__ __volatile__("@ atomic64_set\n" 242 __asm__ __volatile__("@ atomic64_set\n"
263" strd %2, %H2, [%1]" 243" strd %2, %H2, [%1]"
@@ -266,9 +246,9 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
266 ); 246 );
267} 247}
268#else 248#else
269static inline u64 atomic64_read(const atomic64_t *v) 249static inline long long atomic64_read(const atomic64_t *v)
270{ 250{
271 u64 result; 251 long long result;
272 252
273 __asm__ __volatile__("@ atomic64_read\n" 253 __asm__ __volatile__("@ atomic64_read\n"
274" ldrexd %0, %H0, [%1]" 254" ldrexd %0, %H0, [%1]"
@@ -279,10 +259,11 @@ static inline u64 atomic64_read(const atomic64_t *v)
279 return result; 259 return result;
280} 260}
281 261
282static inline void atomic64_set(atomic64_t *v, u64 i) 262static inline void atomic64_set(atomic64_t *v, long long i)
283{ 263{
284 u64 tmp; 264 long long tmp;
285 265
266 prefetchw(&v->counter);
286 __asm__ __volatile__("@ atomic64_set\n" 267 __asm__ __volatile__("@ atomic64_set\n"
287"1: ldrexd %0, %H0, [%2]\n" 268"1: ldrexd %0, %H0, [%2]\n"
288" strexd %0, %3, %H3, [%2]\n" 269" strexd %0, %3, %H3, [%2]\n"
@@ -294,15 +275,16 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
294} 275}
295#endif 276#endif
296 277
297static inline void atomic64_add(u64 i, atomic64_t *v) 278static inline void atomic64_add(long long i, atomic64_t *v)
298{ 279{
299 u64 result; 280 long long result;
300 unsigned long tmp; 281 unsigned long tmp;
301 282
283 prefetchw(&v->counter);
302 __asm__ __volatile__("@ atomic64_add\n" 284 __asm__ __volatile__("@ atomic64_add\n"
303"1: ldrexd %0, %H0, [%3]\n" 285"1: ldrexd %0, %H0, [%3]\n"
304" adds %0, %0, %4\n" 286" adds %Q0, %Q0, %Q4\n"
305" adc %H0, %H0, %H4\n" 287" adc %R0, %R0, %R4\n"
306" strexd %1, %0, %H0, [%3]\n" 288" strexd %1, %0, %H0, [%3]\n"
307" teq %1, #0\n" 289" teq %1, #0\n"
308" bne 1b" 290" bne 1b"
@@ -311,17 +293,17 @@ static inline void atomic64_add(u64 i, atomic64_t *v)
311 : "cc"); 293 : "cc");
312} 294}
313 295
314static inline u64 atomic64_add_return(u64 i, atomic64_t *v) 296static inline long long atomic64_add_return(long long i, atomic64_t *v)
315{ 297{
316 u64 result; 298 long long result;
317 unsigned long tmp; 299 unsigned long tmp;
318 300
319 smp_mb(); 301 smp_mb();
320 302
321 __asm__ __volatile__("@ atomic64_add_return\n" 303 __asm__ __volatile__("@ atomic64_add_return\n"
322"1: ldrexd %0, %H0, [%3]\n" 304"1: ldrexd %0, %H0, [%3]\n"
323" adds %0, %0, %4\n" 305" adds %Q0, %Q0, %Q4\n"
324" adc %H0, %H0, %H4\n" 306" adc %R0, %R0, %R4\n"
325" strexd %1, %0, %H0, [%3]\n" 307" strexd %1, %0, %H0, [%3]\n"
326" teq %1, #0\n" 308" teq %1, #0\n"
327" bne 1b" 309" bne 1b"
@@ -334,15 +316,16 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
334 return result; 316 return result;
335} 317}
336 318
337static inline void atomic64_sub(u64 i, atomic64_t *v) 319static inline void atomic64_sub(long long i, atomic64_t *v)
338{ 320{
339 u64 result; 321 long long result;
340 unsigned long tmp; 322 unsigned long tmp;
341 323
324 prefetchw(&v->counter);
342 __asm__ __volatile__("@ atomic64_sub\n" 325 __asm__ __volatile__("@ atomic64_sub\n"
343"1: ldrexd %0, %H0, [%3]\n" 326"1: ldrexd %0, %H0, [%3]\n"
344" subs %0, %0, %4\n" 327" subs %Q0, %Q0, %Q4\n"
345" sbc %H0, %H0, %H4\n" 328" sbc %R0, %R0, %R4\n"
346" strexd %1, %0, %H0, [%3]\n" 329" strexd %1, %0, %H0, [%3]\n"
347" teq %1, #0\n" 330" teq %1, #0\n"
348" bne 1b" 331" bne 1b"
@@ -351,17 +334,17 @@ static inline void atomic64_sub(u64 i, atomic64_t *v)
351 : "cc"); 334 : "cc");
352} 335}
353 336
354static inline u64 atomic64_sub_return(u64 i, atomic64_t *v) 337static inline long long atomic64_sub_return(long long i, atomic64_t *v)
355{ 338{
356 u64 result; 339 long long result;
357 unsigned long tmp; 340 unsigned long tmp;
358 341
359 smp_mb(); 342 smp_mb();
360 343
361 __asm__ __volatile__("@ atomic64_sub_return\n" 344 __asm__ __volatile__("@ atomic64_sub_return\n"
362"1: ldrexd %0, %H0, [%3]\n" 345"1: ldrexd %0, %H0, [%3]\n"
363" subs %0, %0, %4\n" 346" subs %Q0, %Q0, %Q4\n"
364" sbc %H0, %H0, %H4\n" 347" sbc %R0, %R0, %R4\n"
365" strexd %1, %0, %H0, [%3]\n" 348" strexd %1, %0, %H0, [%3]\n"
366" teq %1, #0\n" 349" teq %1, #0\n"
367" bne 1b" 350" bne 1b"
@@ -374,9 +357,10 @@ static inline u64 atomic64_sub_return(u64 i, atomic64_t *v)
374 return result; 357 return result;
375} 358}
376 359
377static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new) 360static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old,
361 long long new)
378{ 362{
379 u64 oldval; 363 long long oldval;
380 unsigned long res; 364 unsigned long res;
381 365
382 smp_mb(); 366 smp_mb();
@@ -398,9 +382,9 @@ static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new)
398 return oldval; 382 return oldval;
399} 383}
400 384
401static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new) 385static inline long long atomic64_xchg(atomic64_t *ptr, long long new)
402{ 386{
403 u64 result; 387 long long result;
404 unsigned long tmp; 388 unsigned long tmp;
405 389
406 smp_mb(); 390 smp_mb();
@@ -419,18 +403,18 @@ static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
419 return result; 403 return result;
420} 404}
421 405
422static inline u64 atomic64_dec_if_positive(atomic64_t *v) 406static inline long long atomic64_dec_if_positive(atomic64_t *v)
423{ 407{
424 u64 result; 408 long long result;
425 unsigned long tmp; 409 unsigned long tmp;
426 410
427 smp_mb(); 411 smp_mb();
428 412
429 __asm__ __volatile__("@ atomic64_dec_if_positive\n" 413 __asm__ __volatile__("@ atomic64_dec_if_positive\n"
430"1: ldrexd %0, %H0, [%3]\n" 414"1: ldrexd %0, %H0, [%3]\n"
431" subs %0, %0, #1\n" 415" subs %Q0, %Q0, #1\n"
432" sbc %H0, %H0, #0\n" 416" sbc %R0, %R0, #0\n"
433" teq %H0, #0\n" 417" teq %R0, #0\n"
434" bmi 2f\n" 418" bmi 2f\n"
435" strexd %1, %0, %H0, [%3]\n" 419" strexd %1, %0, %H0, [%3]\n"
436" teq %1, #0\n" 420" teq %1, #0\n"
@@ -445,9 +429,9 @@ static inline u64 atomic64_dec_if_positive(atomic64_t *v)
445 return result; 429 return result;
446} 430}
447 431
448static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u) 432static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
449{ 433{
450 u64 val; 434 long long val;
451 unsigned long tmp; 435 unsigned long tmp;
452 int ret = 1; 436 int ret = 1;
453 437
@@ -459,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
459" teqeq %H0, %H5\n" 443" teqeq %H0, %H5\n"
460" moveq %1, #0\n" 444" moveq %1, #0\n"
461" beq 2f\n" 445" beq 2f\n"
462" adds %0, %0, %6\n" 446" adds %Q0, %Q0, %Q6\n"
463" adc %H0, %H0, %H6\n" 447" adc %R0, %R0, %R6\n"
464" strexd %2, %0, %H0, [%4]\n" 448" strexd %2, %0, %H0, [%4]\n"
465" teq %2, #0\n" 449" teq %2, #0\n"
466" bne 1b\n" 450" bne 1b\n"
diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h
new file mode 100644
index 000000000000..1714800fa113
--- /dev/null
+++ b/arch/arm/include/asm/bL_switcher.h
@@ -0,0 +1,77 @@
1/*
2 * arch/arm/include/asm/bL_switcher.h
3 *
4 * Created by: Nicolas Pitre, April 2012
5 * Copyright: (C) 2012-2013 Linaro Limited
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11
12#ifndef ASM_BL_SWITCHER_H
13#define ASM_BL_SWITCHER_H
14
15#include <linux/compiler.h>
16#include <linux/types.h>
17
18typedef void (*bL_switch_completion_handler)(void *cookie);
19
20int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
21 bL_switch_completion_handler completer,
22 void *completer_cookie);
23static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
24{
25 return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
26}
27
28/*
29 * Register here to be notified about runtime enabling/disabling of
30 * the switcher.
31 *
32 * The notifier chain is called with the switcher activation lock held:
33 * the switcher will not be enabled or disabled during callbacks.
34 * Callbacks must not call bL_switcher_{get,put}_enabled().
35 */
36#define BL_NOTIFY_PRE_ENABLE 0
37#define BL_NOTIFY_POST_ENABLE 1
38#define BL_NOTIFY_PRE_DISABLE 2
39#define BL_NOTIFY_POST_DISABLE 3
40
41#ifdef CONFIG_BL_SWITCHER
42
43int bL_switcher_register_notifier(struct notifier_block *nb);
44int bL_switcher_unregister_notifier(struct notifier_block *nb);
45
46/*
47 * Use these functions to temporarily prevent enabling/disabling of
48 * the switcher.
49 * bL_switcher_get_enabled() returns true if the switcher is currently
50 * enabled. Each call to bL_switcher_get_enabled() must be followed
51 * by a call to bL_switcher_put_enabled(). These functions are not
52 * recursive.
53 */
54bool bL_switcher_get_enabled(void);
55void bL_switcher_put_enabled(void);
56
57int bL_switcher_trace_trigger(void);
58int bL_switcher_get_logical_index(u32 mpidr);
59
60#else
61static inline int bL_switcher_register_notifier(struct notifier_block *nb)
62{
63 return 0;
64}
65
66static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
67{
68 return 0;
69}
70
71static inline bool bL_switcher_get_enabled(void) { return false; }
72static inline void bL_switcher_put_enabled(void) { }
73static inline int bL_switcher_trace_trigger(void) { return 0; }
74static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
75#endif /* CONFIG_BL_SWITCHER */
76
77#endif
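
The header above is largely self-describing, but a short sketch may help show how the pieces fit together; the driver context, function names and the CPU/cluster numbers below are purely illustrative, and only the bL_switcher_*/bL_switch_request() calls and the BL_NOTIFY_* events come from this header.

#include <linux/notifier.h>
#include <asm/bL_switcher.h>

static int my_bl_notify(struct notifier_block *nb, unsigned long event, void *unused)
{
        switch (event) {
        case BL_NOTIFY_PRE_ENABLE:
        case BL_NOTIFY_PRE_DISABLE:
                /* quiesce anything that depends on the current CPU mapping */
                break;
        case BL_NOTIFY_POST_ENABLE:
        case BL_NOTIFY_POST_DISABLE:
                break;
        }
        return NOTIFY_OK;       /* must not call bL_switcher_{get,put}_enabled() here */
}

static struct notifier_block my_bl_nb = { .notifier_call = my_bl_notify };

static void my_driver_init(void)
{
        bL_switcher_register_notifier(&my_bl_nb);

        if (bL_switcher_get_enabled()) {        /* pin the switcher enabled...     */
                bL_switch_request(0, 1);        /* move logical CPU 0 to cluster 1 */
                bL_switcher_put_enabled();      /* ...then let it change again     */
        }
}
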
diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h
index 7af5c6c3653a..b274bde24905 100644
--- a/arch/arm/include/asm/bug.h
+++ b/arch/arm/include/asm/bug.h
@@ -2,6 +2,8 @@
2#define _ASMARM_BUG_H 2#define _ASMARM_BUG_H
3 3
4#include <linux/linkage.h> 4#include <linux/linkage.h>
5#include <linux/types.h>
6#include <asm/opcodes.h>
5 7
6#ifdef CONFIG_BUG 8#ifdef CONFIG_BUG
7 9
@@ -12,10 +14,10 @@
12 */ 14 */
13#ifdef CONFIG_THUMB2_KERNEL 15#ifdef CONFIG_THUMB2_KERNEL
14#define BUG_INSTR_VALUE 0xde02 16#define BUG_INSTR_VALUE 0xde02
15#define BUG_INSTR_TYPE ".hword " 17#define BUG_INSTR(__value) __inst_thumb16(__value)
16#else 18#else
17#define BUG_INSTR_VALUE 0xe7f001f2 19#define BUG_INSTR_VALUE 0xe7f001f2
18#define BUG_INSTR_TYPE ".word " 20#define BUG_INSTR(__value) __inst_arm(__value)
19#endif 21#endif
20 22
21 23
@@ -33,7 +35,7 @@
33 35
34#define __BUG(__file, __line, __value) \ 36#define __BUG(__file, __line, __value) \
35do { \ 37do { \
36 asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n" \ 38 asm volatile("1:\t" BUG_INSTR(__value) "\n" \
37 ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \ 39 ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \
38 "2:\t.asciz " #__file "\n" \ 40 "2:\t.asciz " #__file "\n" \
39 ".popsection\n" \ 41 ".popsection\n" \
@@ -48,7 +50,7 @@ do { \
48 50
49#define __BUG(__file, __line, __value) \ 51#define __BUG(__file, __line, __value) \
50do { \ 52do { \
51 asm volatile(BUG_INSTR_TYPE #__value); \ 53 asm volatile(BUG_INSTR(__value) "\n"); \
52 unreachable(); \ 54 unreachable(); \
53} while (0) 55} while (0)
54#endif /* CONFIG_DEBUG_BUGVERBOSE */ 56#endif /* CONFIG_DEBUG_BUGVERBOSE */
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 15f2d5bf8875..ee753f1749cd 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -435,4 +435,50 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
435#define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr)) 435#define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr))
436#define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr)) 436#define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr))
437 437
438/*
439 * Disabling cache access for one CPU in an ARMv7 SMP system is tricky.
440 * To do so we must:
441 *
442 * - Clear the SCTLR.C bit to prevent further cache allocations
443 * - Flush the desired level of cache
444 * - Clear the ACTLR "SMP" bit to disable local coherency
445 *
446 * ... and do so without any intervening memory access between those steps,
447 * not even to the stack.
448 *
449 * WARNING -- After this has been called:
450 *
451 * - No ldrex/strex (and similar) instructions may be used.
452 * - The CPU is obviously no longer coherent with the other CPUs.
453 * - This is unlikely to work as expected if Linux is running non-secure.
454 *
455 * Note:
456 *
457 * - This is known to apply to several ARMv7 processor implementations,
458 * however some exceptions may exist. Caveat emptor.
459 *
460 * - The clobber list is dictated by the call to v7_flush_dcache_*.
461 * fp is preserved to the stack explicitly prior to disabling the cache,
462 * since adding it to the clobber list is incompatible with having
463 * CONFIG_FRAME_POINTER=y. ip is saved as well in case r12-clobbering
464 * trampolines are inserted by the linker and to keep sp 64-bit aligned.
465 */
466#define v7_exit_coherency_flush(level) \
467 asm volatile( \
468 "stmfd sp!, {fp, ip} \n\t" \
469 "mrc p15, 0, r0, c1, c0, 0 @ get SCTLR \n\t" \
470 "bic r0, r0, #"__stringify(CR_C)" \n\t" \
471 "mcr p15, 0, r0, c1, c0, 0 @ set SCTLR \n\t" \
472 "isb \n\t" \
473 "bl v7_flush_dcache_"__stringify(level)" \n\t" \
474 "clrex \n\t" \
475 "mrc p15, 0, r0, c1, c0, 1 @ get ACTLR \n\t" \
476 "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" \
477 "mcr p15, 0, r0, c1, c0, 1 @ set ACTLR \n\t" \
478 "isb \n\t" \
479 "dsb \n\t" \
480 "ldmfd sp!, {fp, ip}" \
481 : : : "r0","r1","r2","r3","r4","r5","r6","r7", \
482 "r9","r10","lr","memory" )
483
438#endif 484#endif
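
As a usage sketch only: a platform's CPU-shutdown path might invoke the macro as below. The function, its last_man argument and the final WFI loop are hypothetical; the two invocations simply select v7_flush_dcache_all or v7_flush_dcache_louis through the __stringify() shown above.

#include <linux/types.h>
#include <asm/cacheflush.h>

static void my_power_down(bool last_man)        /* hypothetical MCPM power_down hook */
{
        if (last_man)
                v7_exit_coherency_flush(all);   /* last CPU: flush the whole hierarchy */
        else
                v7_exit_coherency_flush(louis); /* otherwise: flush to the LoUIS only  */

        /* this CPU is now non-coherent: no ldrex/strex, no shared-memory accesses */
        while (1)
                asm volatile("wfi");
}
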
diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h
index 4f009c10540d..df2fbba7efc8 100644
--- a/arch/arm/include/asm/cmpxchg.h
+++ b/arch/arm/include/asm/cmpxchg.h
@@ -223,6 +223,42 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
223 return ret; 223 return ret;
224} 224}
225 225
226static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
227 unsigned long long old,
228 unsigned long long new)
229{
230 unsigned long long oldval;
231 unsigned long res;
232
233 __asm__ __volatile__(
234"1: ldrexd %1, %H1, [%3]\n"
235" teq %1, %4\n"
236" teqeq %H1, %H4\n"
237" bne 2f\n"
238" strexd %0, %5, %H5, [%3]\n"
239" teq %0, #0\n"
240" bne 1b\n"
241"2:"
242 : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr)
243 : "r" (ptr), "r" (old), "r" (new)
244 : "cc");
245
246 return oldval;
247}
248
249static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr,
250 unsigned long long old,
251 unsigned long long new)
252{
253 unsigned long long ret;
254
255 smp_mb();
256 ret = __cmpxchg64(ptr, old, new);
257 smp_mb();
258
259 return ret;
260}
261
226#define cmpxchg_local(ptr,o,n) \ 262#define cmpxchg_local(ptr,o,n) \
227 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), \ 263 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), \
228 (unsigned long)(o), \ 264 (unsigned long)(o), \
@@ -230,18 +266,16 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
230 sizeof(*(ptr)))) 266 sizeof(*(ptr))))
231 267
232#define cmpxchg64(ptr, o, n) \ 268#define cmpxchg64(ptr, o, n) \
233 ((__typeof__(*(ptr)))atomic64_cmpxchg(container_of((ptr), \ 269 ((__typeof__(*(ptr)))__cmpxchg64_mb((ptr), \
234 atomic64_t, \ 270 (unsigned long long)(o), \
235 counter), \ 271 (unsigned long long)(n)))
236 (unsigned long long)(o), \ 272
237 (unsigned long long)(n))) 273#define cmpxchg64_relaxed(ptr, o, n) \
238 274 ((__typeof__(*(ptr)))__cmpxchg64((ptr), \
239#define cmpxchg64_local(ptr, o, n) \ 275 (unsigned long long)(o), \
240 ((__typeof__(*(ptr)))local64_cmpxchg(container_of((ptr), \ 276 (unsigned long long)(n)))
241 local64_t, \ 277
242 a), \ 278#define cmpxchg64_local(ptr, o, n) cmpxchg64_relaxed((ptr), (o), (n))
243 (unsigned long long)(o), \
244 (unsigned long long)(n)))
245 279
246#endif /* __LINUX_ARM_ARCH__ >= 6 */ 280#endif /* __LINUX_ARM_ARCH__ >= 6 */
247 281
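
For illustration only (the variable and wrapper below are made up): the new 64-bit helpers are used like the 32-bit cmpxchg(), with cmpxchg64() providing full barriers via __cmpxchg64_mb() and cmpxchg64_relaxed()/cmpxchg64_local() using the barrier-free __cmpxchg64().

#include <linux/types.h>

static u64 shared_val;                  /* illustrative 64-bit shared variable */

static bool update_shared(u64 old, u64 new)
{
        /* true only if we observed 'old' and atomically swapped in 'new' */
        return cmpxchg64(&shared_val, old, new) == old;
}
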
diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
index 9672e978d50d..acdde76b39bb 100644
--- a/arch/arm/include/asm/cputype.h
+++ b/arch/arm/include/asm/cputype.h
@@ -10,6 +10,7 @@
10#define CPUID_TLBTYPE 3 10#define CPUID_TLBTYPE 3
11#define CPUID_MPUIR 4 11#define CPUID_MPUIR 4
12#define CPUID_MPIDR 5 12#define CPUID_MPIDR 5
13#define CPUID_REVIDR 6
13 14
14#ifdef CONFIG_CPU_V7M 15#ifdef CONFIG_CPU_V7M
15#define CPUID_EXT_PFR0 0x40 16#define CPUID_EXT_PFR0 0x40
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index 2740c2a2df63..fe3ea776dc34 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -5,7 +5,7 @@
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <asm/irq.h> 6#include <asm/irq.h>
7 7
8#define NR_IPI 6 8#define NR_IPI 8
9 9
10typedef struct { 10typedef struct {
11 unsigned int __softirq_pending; 11 unsigned int __softirq_pending;
diff --git a/arch/arm/include/asm/hardware/coresight.h b/arch/arm/include/asm/hardware/coresight.h
index 0cf7a6b842ff..ad774f37c47c 100644
--- a/arch/arm/include/asm/hardware/coresight.h
+++ b/arch/arm/include/asm/hardware/coresight.h
@@ -24,8 +24,8 @@
24#define TRACER_TIMEOUT 10000 24#define TRACER_TIMEOUT 10000
25 25
26#define etm_writel(t, v, x) \ 26#define etm_writel(t, v, x) \
27 (__raw_writel((v), (t)->etm_regs + (x))) 27 (writel_relaxed((v), (t)->etm_regs + (x)))
28#define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x))) 28#define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x)))
29 29
30/* CoreSight Management Registers */ 30/* CoreSight Management Registers */
31#define CSMR_LOCKACCESS 0xfb0 31#define CSMR_LOCKACCESS 0xfb0
@@ -142,8 +142,8 @@
142#define ETBFF_TRIGFL BIT(10) 142#define ETBFF_TRIGFL BIT(10)
143 143
144#define etb_writel(t, v, x) \ 144#define etb_writel(t, v, x) \
145 (__raw_writel((v), (t)->etb_regs + (x))) 145 (writel_relaxed((v), (t)->etb_regs + (x)))
146#define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x))) 146#define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x)))
147 147
148#define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0) 148#define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0)
149#define etm_unlock(t) \ 149#define etm_unlock(t) \
diff --git a/arch/arm/include/asm/kgdb.h b/arch/arm/include/asm/kgdb.h
index 48066ce9ea34..0a9d5dd93294 100644
--- a/arch/arm/include/asm/kgdb.h
+++ b/arch/arm/include/asm/kgdb.h
@@ -11,6 +11,7 @@
11#define __ARM_KGDB_H__ 11#define __ARM_KGDB_H__
12 12
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <asm/opcodes.h>
14 15
15/* 16/*
16 * GDB assumes that we're a user process being debugged, so 17 * GDB assumes that we're a user process being debugged, so
@@ -41,7 +42,7 @@
41 42
42static inline void arch_kgdb_breakpoint(void) 43static inline void arch_kgdb_breakpoint(void)
43{ 44{
44 asm(".word 0xe7ffdeff"); 45 asm(__inst_arm(0xe7ffdeff));
45} 46}
46 47
47extern void kgdb_handle_bus_error(void); 48extern void kgdb_handle_bus_error(void);
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h
index 402a2bc6aa68..17a3fa2979e8 100644
--- a/arch/arm/include/asm/mach/arch.h
+++ b/arch/arm/include/asm/mach/arch.h
@@ -49,6 +49,7 @@ struct machine_desc {
49 bool (*smp_init)(void); 49 bool (*smp_init)(void);
50 void (*fixup)(struct tag *, char **, 50 void (*fixup)(struct tag *, char **,
51 struct meminfo *); 51 struct meminfo *);
52 void (*init_meminfo)(void);
52 void (*reserve)(void);/* reserve mem blocks */ 53 void (*reserve)(void);/* reserve mem blocks */
53 void (*map_io)(void);/* IO mapping function */ 54 void (*map_io)(void);/* IO mapping function */
54 void (*init_early)(void); 55 void (*init_early)(void);
diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h
index fc82a88f5b69..608516ebabfe 100644
--- a/arch/arm/include/asm/mcpm.h
+++ b/arch/arm/include/asm/mcpm.h
@@ -42,6 +42,14 @@ extern void mcpm_entry_point(void);
42void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); 42void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
43 43
44/* 44/*
45 * This sets an early poke, i.e. a value to be poked into some address
46 * from very early assembly code before the CPU is ungated. The
47 * address must be physical, and if 0 then nothing will happen.
48 */
49void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
50 unsigned long poke_phys_addr, unsigned long poke_val);
51
52/*
45 * CPU/cluster power operations API for higher subsystems to use. 53 * CPU/cluster power operations API for higher subsystems to use.
46 */ 54 */
47 55
@@ -81,10 +89,40 @@ int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster);
81 * 89 *
82 * This will return if mcpm_platform_register() has not been called 90 * This will return if mcpm_platform_register() has not been called
83 * previously in which case the caller should take appropriate action. 91 * previously in which case the caller should take appropriate action.
92 *
93 * On success, the CPU is not guaranteed to be truly halted until
94 * mcpm_cpu_power_down_finish() subsequently returns zero for the
95 * specified cpu. Until then, other CPUs should make sure they do not
96 * trash memory the target CPU might be executing/accessing.
84 */ 97 */
85void mcpm_cpu_power_down(void); 98void mcpm_cpu_power_down(void);
86 99
87/** 100/**
101 * mcpm_cpu_power_down_finish - wait for a specified CPU to halt, and
102 * make sure it is powered off
103 *
104 * @cpu: CPU number within given cluster
105 * @cluster: cluster number for the CPU
106 *
107 * Call this function to ensure that a pending powerdown has taken
108 * effect and the CPU is safely parked before performing non-mcpm
109 * operations that may affect the CPU (such as kexec trashing the
110 * kernel text).
111 *
112 * It is *not* necessary to call this function if you only need to
113 * serialise a pending powerdown with mcpm_cpu_power_up() or a wakeup
114 * event.
115 *
116 * Do not call this function unless the specified CPU has already
117 * called mcpm_cpu_power_down() or has committed to doing so.
118 *
119 * @return:
120 * - zero if the CPU is in a safely parked state
121 * - nonzero otherwise (e.g., timeout)
122 */
123int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster);
124
125/**
88 * mcpm_cpu_suspend - bring the calling CPU in a suspended state 126 * mcpm_cpu_suspend - bring the calling CPU in a suspended state
89 * 127 *
90 * @expected_residency: duration in microseconds the CPU is expected 128 * @expected_residency: duration in microseconds the CPU is expected
@@ -126,6 +164,7 @@ int mcpm_cpu_powered_up(void);
126struct mcpm_platform_ops { 164struct mcpm_platform_ops {
127 int (*power_up)(unsigned int cpu, unsigned int cluster); 165 int (*power_up)(unsigned int cpu, unsigned int cluster);
128 void (*power_down)(void); 166 void (*power_down)(void);
167 int (*power_down_finish)(unsigned int cpu, unsigned int cluster);
129 void (*suspend)(u64); 168 void (*suspend)(u64);
130 void (*powered_up)(void); 169 void (*powered_up)(void);
131}; 170};
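mcpm_cpu_power_down_finish() closes a gap in the CPU hotplug story: the CPU calling mcpm_cpu_power_down() cannot itself report when the hardware has really reached a safe state, so a surviving CPU polls for it. A hedged sketch of how an SMP backend might pair the two calls (function names and the cpu-to-cluster lookup are illustrative; the real wiring is in mcpm_platsmp.c):

/* Illustrative only: cpu_die runs on the CPU going down, cpu_kill on a survivor. */
static void example_cpu_die(unsigned int cpu)
{
	mcpm_cpu_power_down();			/* normally does not return */
}

static int example_cpu_kill(unsigned int cpu)
{
	unsigned int pcpu = 0, pcluster = 0;	/* assumed: derived from the CPU's MPIDR */

	/* smp_operations.cpu_kill expects non-zero for success, so invert the
	 * zero-on-success return of mcpm_cpu_power_down_finish() */
	return !mcpm_cpu_power_down_finish(pcpu, pcluster);
}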
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index e750a938fd3c..4dd21457ef9d 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -172,8 +172,13 @@
172 * so that all we need to do is modify the 8-bit constant field. 172 * so that all we need to do is modify the 8-bit constant field.
173 */ 173 */
174#define __PV_BITS_31_24 0x81000000 174#define __PV_BITS_31_24 0x81000000
175#define __PV_BITS_7_0 0x81
176
177extern u64 __pv_phys_offset;
178extern u64 __pv_offset;
179extern void fixup_pv_table(const void *, unsigned long);
180extern const void *__pv_table_begin, *__pv_table_end;
175 181
176extern unsigned long __pv_phys_offset;
177#define PHYS_OFFSET __pv_phys_offset 182#define PHYS_OFFSET __pv_phys_offset
178 183
179#define __pv_stub(from,to,instr,type) \ 184#define __pv_stub(from,to,instr,type) \
@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset;
185 : "=r" (to) \ 190 : "=r" (to) \
186 : "r" (from), "I" (type)) 191 : "r" (from), "I" (type))
187 192
188static inline unsigned long __virt_to_phys(unsigned long x) 193#define __pv_stub_mov_hi(t) \
194 __asm__ volatile("@ __pv_stub_mov\n" \
195 "1: mov %R0, %1\n" \
196 " .pushsection .pv_table,\"a\"\n" \
197 " .long 1b\n" \
198 " .popsection\n" \
199 : "=r" (t) \
200 : "I" (__PV_BITS_7_0))
201
202#define __pv_add_carry_stub(x, y) \
203 __asm__ volatile("@ __pv_add_carry_stub\n" \
204 "1: adds %Q0, %1, %2\n" \
205 " adc %R0, %R0, #0\n" \
206 " .pushsection .pv_table,\"a\"\n" \
207 " .long 1b\n" \
208 " .popsection\n" \
209 : "+r" (y) \
210 : "r" (x), "I" (__PV_BITS_31_24) \
211 : "cc")
212
213static inline phys_addr_t __virt_to_phys(unsigned long x)
189{ 214{
190 unsigned long t; 215 phys_addr_t t;
191 __pv_stub(x, t, "add", __PV_BITS_31_24); 216
217 if (sizeof(phys_addr_t) == 4) {
218 __pv_stub(x, t, "add", __PV_BITS_31_24);
219 } else {
220 __pv_stub_mov_hi(t);
221 __pv_add_carry_stub(x, t);
222 }
192 return t; 223 return t;
193} 224}
194 225
195static inline unsigned long __phys_to_virt(unsigned long x) 226static inline unsigned long __phys_to_virt(phys_addr_t x)
196{ 227{
197 unsigned long t; 228 unsigned long t;
198 __pv_stub(x, t, "sub", __PV_BITS_31_24); 229 __pv_stub(x, t, "sub", __PV_BITS_31_24);
199 return t; 230 return t;
200} 231}
232
201#else 233#else
202#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET) 234
203#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET) 235static inline phys_addr_t __virt_to_phys(unsigned long x)
236{
237 return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
238}
239
240static inline unsigned long __phys_to_virt(phys_addr_t x)
241{
242 return x - PHYS_OFFSET + PAGE_OFFSET;
243}
244
204#endif 245#endif
205#endif 246#endif
206#endif /* __ASSEMBLY__ */ 247#endif /* __ASSEMBLY__ */
@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x)
238 279
239static inline void *phys_to_virt(phys_addr_t x) 280static inline void *phys_to_virt(phys_addr_t x)
240{ 281{
241 return (void *)(__phys_to_virt((unsigned long)(x))); 282 return (void *)__phys_to_virt(x);
242} 283}
243 284
244/* 285/*
245 * Drivers should NOT use these either. 286 * Drivers should NOT use these either.
246 */ 287 */
247#define __pa(x) __virt_to_phys((unsigned long)(x)) 288#define __pa(x) __virt_to_phys((unsigned long)(x))
248#define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) 289#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
249#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) 290#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
250 291
292extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x);
293
294/*
295 * These are for systems that have a hardware interconnect supported alias of
296 * physical memory for idmap purposes. Most cases should leave these
297 * untouched.
298 */
299static inline phys_addr_t __virt_to_idmap(unsigned long x)
300{
301 if (arch_virt_to_idmap)
302 return arch_virt_to_idmap(x);
303 else
304 return __virt_to_phys(x);
305}
306
307#define virt_to_idmap(x) __virt_to_idmap((unsigned long)(x))
308
251/* 309/*
252 * Virtual <-> DMA view memory address translations 310 * Virtual <-> DMA view memory address translations
253 * Again, these are *only* valid on the kernel direct mapped RAM 311 * Again, these are *only* valid on the kernel direct mapped RAM
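With __pv_offset widened to 64 bits, the patched stub sequence builds a phys_addr_t in two halves: __pv_stub_mov_hi() plants the high word, then __pv_add_carry_stub() adds the low word to the virtual address and folds the carry into the high half. A stand-alone C model of that arithmetic (the offset value below is made up for the example):

#include <stdint.h>
#include <stdio.h>

static uint64_t pv_offset;	/* PHYS_OFFSET - PAGE_OFFSET, may exceed 32 bits */

static uint64_t virt_to_phys_model(uint32_t virt)
{
	uint64_t t = (uint64_t)(uint32_t)(pv_offset >> 32) << 32;	/* mov_hi stub */
	t += (uint64_t)virt + (uint32_t)pv_offset;			/* adds + adc carry */
	return t;
}

int main(void)
{
	pv_offset = 0x200000000ULL - 0xc0000000UL;	/* e.g. RAM at 8GiB, PAGE_OFFSET 0xc0000000 */
	printf("0x%llx\n", (unsigned long long)virt_to_phys_model(0xc0001000U));
	return 0;					/* prints 0x200001000 */
}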
diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h
index 6f18da09668b..64fd15159b7d 100644
--- a/arch/arm/include/asm/mmu.h
+++ b/arch/arm/include/asm/mmu.h
@@ -16,7 +16,7 @@ typedef struct {
16#ifdef CONFIG_CPU_HAS_ASID 16#ifdef CONFIG_CPU_HAS_ASID
17#define ASID_BITS 8 17#define ASID_BITS 8
18#define ASID_MASK ((~0ULL) << ASID_BITS) 18#define ASID_MASK ((~0ULL) << ASID_BITS)
19#define ASID(mm) ((mm)->context.id.counter & ~ASID_MASK) 19#define ASID(mm) ((unsigned int)((mm)->context.id.counter & ~ASID_MASK))
20#else 20#else
21#define ASID(mm) (0) 21#define ASID(mm) (0)
22#endif 22#endif
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
index f97ee02386ee..86a659a19526 100644
--- a/arch/arm/include/asm/pgtable-2level.h
+++ b/arch/arm/include/asm/pgtable-2level.h
@@ -181,6 +181,13 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
181 181
182#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext) 182#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
183 183
184/*
185 * We don't have huge page support for short descriptors, for the moment
186 * define empty stubs for use by pin_page_for_write.
187 */
188#define pmd_hugewillfault(pmd) (0)
189#define pmd_thp_or_huge(pmd) (0)
190
184#endif /* __ASSEMBLY__ */ 191#endif /* __ASSEMBLY__ */
185 192
186#endif /* _ASM_PGTABLE_2LEVEL_H */ 193#endif /* _ASM_PGTABLE_2LEVEL_H */
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 5689c18c85f5..39c54cfa03e9 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -206,6 +206,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
206#define __HAVE_ARCH_PMD_WRITE 206#define __HAVE_ARCH_PMD_WRITE
207#define pmd_write(pmd) (!(pmd_val(pmd) & PMD_SECT_RDONLY)) 207#define pmd_write(pmd) (!(pmd_val(pmd) & PMD_SECT_RDONLY))
208 208
209#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
210#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
211
209#ifdef CONFIG_TRANSPARENT_HUGEPAGE 212#ifdef CONFIG_TRANSPARENT_HUGEPAGE
210#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) 213#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
211#define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING) 214#define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
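pmd_thp_or_huge() and pmd_hugewillfault() let the user-access pinning path inspect a huge mapping without splitting it. A hedged sketch of the intended use; the real caller is pin_page_for_write() in lib/uaccess_with_memcpy.c and the locking is elided here:

/* Fragment only: decide whether a user write can be pinned through a huge
 * PMD or must fall back to the ordinary faulting copy path. */
if (pmd_thp_or_huge(*pmd)) {
	if (pmd_hugewillfault(*pmd))
		return 0;		/* not young or not writable: take the fault */
	page = pmd_page(*pmd);		/* otherwise pin the huge page directly */
}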
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 413f3876341c..c3d5fc124a05 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -22,6 +22,7 @@
22#include <asm/hw_breakpoint.h> 22#include <asm/hw_breakpoint.h>
23#include <asm/ptrace.h> 23#include <asm/ptrace.h>
24#include <asm/types.h> 24#include <asm/types.h>
25#include <asm/unified.h>
25 26
26#ifdef __KERNEL__ 27#ifdef __KERNEL__
27#define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \ 28#define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \
@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p);
87#define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc 88#define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc
88#define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp 89#define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp
89 90
91#ifdef CONFIG_SMP
92#define __ALT_SMP_ASM(smp, up) \
93 "9998: " smp "\n" \
94 " .pushsection \".alt.smp.init\", \"a\"\n" \
95 " .long 9998b\n" \
96 " " up "\n" \
97 " .popsection\n"
98#else
99#define __ALT_SMP_ASM(smp, up) up
100#endif
101
90/* 102/*
91 * Prefetching support - only ARMv5. 103 * Prefetching support - only ARMv5.
92 */ 104 */
@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr)
97{ 109{
98 __asm__ __volatile__( 110 __asm__ __volatile__(
99 "pld\t%a0" 111 "pld\t%a0"
100 : 112 :: "p" (ptr));
101 : "p" (ptr)
102 : "cc");
103} 113}
104 114
115#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
105#define ARCH_HAS_PREFETCHW 116#define ARCH_HAS_PREFETCHW
106#define prefetchw(ptr) prefetch(ptr) 117static inline void prefetchw(const void *ptr)
107 118{
108#define ARCH_HAS_SPINLOCK_PREFETCH 119 __asm__ __volatile__(
109#define spin_lock_prefetch(x) do { } while (0) 120 ".arch_extension mp\n"
110 121 __ALT_SMP_ASM(
122 WASM(pldw) "\t%a0",
123 WASM(pld) "\t%a0"
124 )
125 :: "p" (ptr));
126}
127#endif
111#endif 128#endif
112 129
113#define HAVE_ARCH_PICK_MMAP_LAYOUT 130#define HAVE_ARCH_PICK_MMAP_LAYOUT
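__ALT_SMP_ASM() is the string-pasting core of the SMP-alternatives trick: the SMP instruction is emitted inline and its address is recorded in .alt.smp.init next to the UP replacement, so a uniprocessor boot can patch it out. The snippet below simply runs the macros from this hunk through the preprocessor and prints the assembly text prefetchw() would hand to the compiler on a non-Thumb-2 SMP build (on a Thumb-2 kernel WASM() also appends a .w suffix):

#include <stdio.h>

#define WASM(instr)	#instr			/* non-Thumb-2 kernel: no ".w" suffix */

#define __ALT_SMP_ASM(smp, up) \
	"9998:	" smp "\n" \
	"	.pushsection \".alt.smp.init\", \"a\"\n" \
	"	.long	9998b\n" \
	"	" up "\n" \
	"	.popsection\n"

int main(void)
{
	/* The asm body of prefetchw(): pldw inline, pld recorded as the UP fallback. */
	puts(".arch_extension mp\n" __ALT_SMP_ASM(WASM(pldw) "\t%a0", WASM(pld) "\t%a0"));
	return 0;
}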
diff --git a/arch/arm/include/asm/setup.h b/arch/arm/include/asm/setup.h
index c50f05609501..8d6a089dfb76 100644
--- a/arch/arm/include/asm/setup.h
+++ b/arch/arm/include/asm/setup.h
@@ -49,7 +49,7 @@ extern struct meminfo meminfo;
49#define bank_phys_end(bank) ((bank)->start + (bank)->size) 49#define bank_phys_end(bank) ((bank)->start + (bank)->size)
50#define bank_phys_size(bank) (bank)->size 50#define bank_phys_size(bank) (bank)->size
51 51
52extern int arm_add_memory(phys_addr_t start, phys_addr_t size); 52extern int arm_add_memory(u64 start, u64 size);
53extern void early_print(const char *str, ...); 53extern void early_print(const char *str, ...);
54extern void dump_machine_table(void); 54extern void dump_machine_table(void);
55 55
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index a8cae71caceb..22a3b9b5d4a1 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu);
84extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); 84extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
85extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); 85extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
86 86
87extern int register_ipi_completion(struct completion *completion, int cpu);
88
87struct smp_operations { 89struct smp_operations {
88#ifdef CONFIG_SMP 90#ifdef CONFIG_SMP
89 /* 91 /*
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index 4f2c28060c9a..ef3c6072aa45 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -5,21 +5,13 @@
5#error SMP not supported on pre-ARMv6 CPUs 5#error SMP not supported on pre-ARMv6 CPUs
6#endif 6#endif
7 7
8#include <asm/processor.h> 8#include <linux/prefetch.h>
9 9
10/* 10/*
11 * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K 11 * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K
12 * extensions, so when running on UP, we have to patch these instructions away. 12 * extensions, so when running on UP, we have to patch these instructions away.
13 */ 13 */
14#define ALT_SMP(smp, up) \
15 "9998: " smp "\n" \
16 " .pushsection \".alt.smp.init\", \"a\"\n" \
17 " .long 9998b\n" \
18 " " up "\n" \
19 " .popsection\n"
20
21#ifdef CONFIG_THUMB2_KERNEL 14#ifdef CONFIG_THUMB2_KERNEL
22#define SEV ALT_SMP("sev.w", "nop.w")
23/* 15/*
24 * For Thumb-2, special care is needed to ensure that the conditional WFE 16 * For Thumb-2, special care is needed to ensure that the conditional WFE
25 * instruction really does assemble to exactly 4 bytes (as required by 17 * instruction really does assemble to exactly 4 bytes (as required by
@@ -31,17 +23,18 @@
31 * the assembler won't change IT instructions which are explicitly present 23 * the assembler won't change IT instructions which are explicitly present
32 * in the input. 24 * in the input.
33 */ 25 */
34#define WFE(cond) ALT_SMP( \ 26#define WFE(cond) __ALT_SMP_ASM( \
35 "it " cond "\n\t" \ 27 "it " cond "\n\t" \
36 "wfe" cond ".n", \ 28 "wfe" cond ".n", \
37 \ 29 \
38 "nop.w" \ 30 "nop.w" \
39) 31)
40#else 32#else
41#define SEV ALT_SMP("sev", "nop") 33#define WFE(cond) __ALT_SMP_ASM("wfe" cond, "nop")
42#define WFE(cond) ALT_SMP("wfe" cond, "nop")
43#endif 34#endif
44 35
36#define SEV __ALT_SMP_ASM(WASM(sev), WASM(nop))
37
45static inline void dsb_sev(void) 38static inline void dsb_sev(void)
46{ 39{
47#if __LINUX_ARM_ARCH__ >= 7 40#if __LINUX_ARM_ARCH__ >= 7
@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
77 u32 newval; 70 u32 newval;
78 arch_spinlock_t lockval; 71 arch_spinlock_t lockval;
79 72
73 prefetchw(&lock->slock);
80 __asm__ __volatile__( 74 __asm__ __volatile__(
81"1: ldrex %0, [%3]\n" 75"1: ldrex %0, [%3]\n"
82" add %1, %0, %4\n" 76" add %1, %0, %4\n"
@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
100 unsigned long contended, res; 94 unsigned long contended, res;
101 u32 slock; 95 u32 slock;
102 96
97 prefetchw(&lock->slock);
103 do { 98 do {
104 __asm__ __volatile__( 99 __asm__ __volatile__(
105 " ldrex %0, [%3]\n" 100 " ldrex %0, [%3]\n"
@@ -127,10 +122,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
127 dsb_sev(); 122 dsb_sev();
128} 123}
129 124
125static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
126{
127 return lock.tickets.owner == lock.tickets.next;
128}
129
130static inline int arch_spin_is_locked(arch_spinlock_t *lock) 130static inline int arch_spin_is_locked(arch_spinlock_t *lock)
131{ 131{
132 struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets); 132 return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
133 return tickets.owner != tickets.next;
134} 133}
135 134
136static inline int arch_spin_is_contended(arch_spinlock_t *lock) 135static inline int arch_spin_is_contended(arch_spinlock_t *lock)
@@ -152,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
152{ 151{
153 unsigned long tmp; 152 unsigned long tmp;
154 153
154 prefetchw(&rw->lock);
155 __asm__ __volatile__( 155 __asm__ __volatile__(
156"1: ldrex %0, [%1]\n" 156"1: ldrex %0, [%1]\n"
157" teq %0, #0\n" 157" teq %0, #0\n"
@@ -170,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
170{ 170{
171 unsigned long contended, res; 171 unsigned long contended, res;
172 172
173 prefetchw(&rw->lock);
173 do { 174 do {
174 __asm__ __volatile__( 175 __asm__ __volatile__(
175 " ldrex %0, [%2]\n" 176 " ldrex %0, [%2]\n"
@@ -203,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
203} 204}
204 205
205/* write_can_lock - would write_trylock() succeed? */ 206/* write_can_lock - would write_trylock() succeed? */
206#define arch_write_can_lock(x) ((x)->lock == 0) 207#define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0)
207 208
208/* 209/*
209 * Read locks are a bit more hairy: 210 * Read locks are a bit more hairy:
@@ -221,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
221{ 222{
222 unsigned long tmp, tmp2; 223 unsigned long tmp, tmp2;
223 224
225 prefetchw(&rw->lock);
224 __asm__ __volatile__( 226 __asm__ __volatile__(
225"1: ldrex %0, [%2]\n" 227"1: ldrex %0, [%2]\n"
226" adds %0, %0, #1\n" 228" adds %0, %0, #1\n"
@@ -241,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
241 243
242 smp_mb(); 244 smp_mb();
243 245
246 prefetchw(&rw->lock);
244 __asm__ __volatile__( 247 __asm__ __volatile__(
245"1: ldrex %0, [%2]\n" 248"1: ldrex %0, [%2]\n"
246" sub %0, %0, #1\n" 249" sub %0, %0, #1\n"
@@ -259,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
259{ 262{
260 unsigned long contended, res; 263 unsigned long contended, res;
261 264
265 prefetchw(&rw->lock);
262 do { 266 do {
263 __asm__ __volatile__( 267 __asm__ __volatile__(
264 " ldrex %0, [%2]\n" 268 " ldrex %0, [%2]\n"
@@ -280,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
280} 284}
281 285
282/* read_can_lock - would read_trylock() succeed? */ 286/* read_can_lock - would read_trylock() succeed? */
283#define arch_read_can_lock(x) ((x)->lock < 0x80000000) 287#define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000)
284 288
285#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) 289#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
286#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) 290#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
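Splitting out arch_spin_value_unlocked() lets generic code (the lockref cmpxchg loop, for example) test a ticket-lock value it already holds in a register, and arch_spin_is_locked() is now just its negation on an ACCESS_ONCE snapshot. A stand-alone C model of the layout and the check, assuming the little-endian field order of arch_spinlock_t:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the ARM ticket lock: "owner" is the ticket being served,
 * "next" is the next ticket to hand out. */
typedef union {
	uint32_t slock;
	struct {
		uint16_t owner;
		uint16_t next;
	} tickets;
} spinlock_model_t;

static bool value_unlocked(spinlock_model_t lock)
{
	return lock.tickets.owner == lock.tickets.next;
}

int main(void)
{
	spinlock_model_t lock = { .slock = 0 };

	printf("%d\n", value_unlocked(lock));	/* 1: a fresh lock is unlocked */
	lock.tickets.next++;			/* a CPU takes a ticket */
	printf("%d\n", value_unlocked(lock));	/* 0: owner != next, lock held */
	return 0;
}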
diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
index b262d2f8b478..47663fcb10ad 100644
--- a/arch/arm/include/asm/spinlock_types.h
+++ b/arch/arm/include/asm/spinlock_types.h
@@ -25,7 +25,7 @@ typedef struct {
25#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } 25#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
26 26
27typedef struct { 27typedef struct {
28 volatile unsigned int lock; 28 u32 lock;
29} arch_rwlock_t; 29} arch_rwlock_t;
30 30
31#define __ARCH_RW_LOCK_UNLOCKED { 0 } 31#define __ARCH_RW_LOCK_UNLOCKED { 0 }
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index 38960264040c..def9e570199f 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -560,37 +560,6 @@ static inline void __flush_bp_all(void)
560 asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero)); 560 asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero));
561} 561}
562 562
563#include <asm/cputype.h>
564#ifdef CONFIG_ARM_ERRATA_798181
565static inline int erratum_a15_798181(void)
566{
567 unsigned int midr = read_cpuid_id();
568
569 /* Cortex-A15 r0p0..r3p2 affected */
570 if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2)
571 return 0;
572 return 1;
573}
574
575static inline void dummy_flush_tlb_a15_erratum(void)
576{
577 /*
578 * Dummy TLBIMVAIS. Using the unmapped address 0 and ASID 0.
579 */
580 asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
581 dsb(ish);
582}
583#else
584static inline int erratum_a15_798181(void)
585{
586 return 0;
587}
588
589static inline void dummy_flush_tlb_a15_erratum(void)
590{
591}
592#endif
593
594/* 563/*
595 * flush_pmd_entry 564 * flush_pmd_entry
596 * 565 *
@@ -697,4 +666,21 @@ extern void flush_bp_all(void);
697 666
698#endif 667#endif
699 668
669#ifndef __ASSEMBLY__
670#ifdef CONFIG_ARM_ERRATA_798181
671extern void erratum_a15_798181_init(void);
672#else
673static inline void erratum_a15_798181_init(void) {}
674#endif
675extern bool (*erratum_a15_798181_handler)(void);
676
677static inline bool erratum_a15_798181(void)
678{
679 if (unlikely(IS_ENABLED(CONFIG_ARM_ERRATA_798181) &&
680 erratum_a15_798181_handler))
681 return erratum_a15_798181_handler();
682 return false;
683}
684#endif
685
700#endif 686#endif
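The erratum test now has a cheap compile-time gate (IS_ENABLED) plus a runtime handler pointer installed at boot, so unaffected systems pay almost nothing per TLB flush. A hedged sketch of what the boot-time init is expected to do, reusing the MIDR check and the dummy TLB operation from the deleted inline helpers; the real implementation lives in smp_tlb.c and is not shown here:

/* Sketch only: one plausible handler plus the install logic. */
static bool a15_erratum_workaround(void)
{
	/* dummy TLBIMVAIS using unmapped address 0 and ASID 0, as before */
	asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
	dsb(ish);
	return true;
}

void erratum_a15_798181_init(void)
{
	unsigned int midr = read_cpuid_id();

	/* Cortex-A15 r0p0..r3p2 affected */
	if ((midr & 0xff0ffff0) == 0x410fc0f0 && midr <= 0x413fc0f2)
		erratum_a15_798181_handler = a15_erratum_workaround;
}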
diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h
index f5989f46b4d2..b88beaba6b4a 100644
--- a/arch/arm/include/asm/unified.h
+++ b/arch/arm/include/asm/unified.h
@@ -38,6 +38,8 @@
38#ifdef __ASSEMBLY__ 38#ifdef __ASSEMBLY__
39#define W(instr) instr.w 39#define W(instr) instr.w
40#define BSYM(sym) sym + 1 40#define BSYM(sym) sym + 1
41#else
42#define WASM(instr) #instr ".w"
41#endif 43#endif
42 44
43#else /* !CONFIG_THUMB2_KERNEL */ 45#else /* !CONFIG_THUMB2_KERNEL */
@@ -50,6 +52,8 @@
50#ifdef __ASSEMBLY__ 52#ifdef __ASSEMBLY__
51#define W(instr) instr 53#define W(instr) instr
52#define BSYM(sym) sym 54#define BSYM(sym) sym
55#else
56#define WASM(instr) #instr
53#endif 57#endif
54 58
55#endif /* CONFIG_THUMB2_KERNEL */ 59#endif /* CONFIG_THUMB2_KERNEL */
diff --git a/arch/arm/include/debug/efm32.S b/arch/arm/include/debug/efm32.S
new file mode 100644
index 000000000000..2265a199280c
--- /dev/null
+++ b/arch/arm/include/debug/efm32.S
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) 2013 Pengutronix
3 * Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#define UARTn_CMD 0x000c
11#define UARTn_CMD_TXEN 0x0004
12
13#define UARTn_STATUS 0x0010
14#define UARTn_STATUS_TXC 0x0020
15#define UARTn_STATUS_TXBL 0x0040
16
17#define UARTn_TXDATA 0x0034
18
19 .macro addruart, rx, tmp
20 ldr \rx, =(CONFIG_DEBUG_UART_PHYS)
21
22 /*
23 * Enable TX. The driver might disable it to save energy. We
24 * don't care about disabling it again at the end, as power
25 * consumption isn't that important while debugging.
26 */
27 ldr \tmp, =(UARTn_CMD_TXEN)
28 str \tmp, [\rx, #UARTn_CMD]
29 .endm
30
31 .macro senduart,rd,rx
32 strb \rd, [\rx, #UARTn_TXDATA]
33 .endm
34
35 .macro waituart,rd,rx
361001: ldr \rd, [\rx, #UARTn_STATUS]
37 tst \rd, #UARTn_STATUS_TXBL
38 beq 1001b
39 .endm
40
41 .macro busyuart,rd,rx
421001: ldr \rd, [\rx, #UARTn_STATUS]
43 tst \rd, #UARTn_STATUS_TXC
44 bne 1001b
45 .endm
diff --git a/arch/arm/include/debug/msm.S b/arch/arm/include/debug/msm.S
index 9166e1bc470e..9d653d475903 100644
--- a/arch/arm/include/debug/msm.S
+++ b/arch/arm/include/debug/msm.S
@@ -46,6 +46,11 @@
46#define MSM_DEBUG_UART_PHYS 0x16440000 46#define MSM_DEBUG_UART_PHYS 0x16440000
47#endif 47#endif
48 48
49#ifdef CONFIG_DEBUG_MSM8974_UART
50#define MSM_DEBUG_UART_BASE 0xFA71E000
51#define MSM_DEBUG_UART_PHYS 0xF991E000
52#endif
53
49 .macro addruart, rp, rv, tmp 54 .macro addruart, rp, rv, tmp
50#ifdef MSM_DEBUG_UART_PHYS 55#ifdef MSM_DEBUG_UART_PHYS
51 ldr \rp, =MSM_DEBUG_UART_PHYS 56 ldr \rp, =MSM_DEBUG_UART_PHYS
diff --git a/arch/arm/include/debug/pl01x.S b/arch/arm/include/debug/pl01x.S
index 37c6895b87e6..92ef808a2337 100644
--- a/arch/arm/include/debug/pl01x.S
+++ b/arch/arm/include/debug/pl01x.S
@@ -25,12 +25,14 @@
25 25
26 .macro waituart,rd,rx 26 .macro waituart,rd,rx
271001: ldr \rd, [\rx, #UART01x_FR] 271001: ldr \rd, [\rx, #UART01x_FR]
28 ARM_BE8( rev \rd, \rd )
28 tst \rd, #UART01x_FR_TXFF 29 tst \rd, #UART01x_FR_TXFF
29 bne 1001b 30 bne 1001b
30 .endm 31 .endm
31 32
32 .macro busyuart,rd,rx 33 .macro busyuart,rd,rx
331001: ldr \rd, [\rx, #UART01x_FR] 341001: ldr \rd, [\rx, #UART01x_FR]
35 ARM_BE8( rev \rd, \rd )
34 tst \rd, #UART01x_FR_BUSY 36 tst \rd, #UART01x_FR_BUSY
35 bne 1001b 37 bne 1001b
36 .endm 38 .endm
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 18d76fd5a2af..70a1c9da30ca 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ header-y += hwcap.h
7header-y += ioctls.h 7header-y += ioctls.h
8header-y += kvm_para.h 8header-y += kvm_para.h
9header-y += mman.h 9header-y += mman.h
10header-y += perf_regs.h
10header-y += posix_types.h 11header-y += posix_types.h
11header-y += ptrace.h 12header-y += ptrace.h
12header-y += setup.h 13header-y += setup.h
diff --git a/arch/arm/include/uapi/asm/perf_regs.h b/arch/arm/include/uapi/asm/perf_regs.h
new file mode 100644
index 000000000000..ce59448458b2
--- /dev/null
+++ b/arch/arm/include/uapi/asm/perf_regs.h
@@ -0,0 +1,23 @@
1#ifndef _ASM_ARM_PERF_REGS_H
2#define _ASM_ARM_PERF_REGS_H
3
4enum perf_event_arm_regs {
5 PERF_REG_ARM_R0,
6 PERF_REG_ARM_R1,
7 PERF_REG_ARM_R2,
8 PERF_REG_ARM_R3,
9 PERF_REG_ARM_R4,
10 PERF_REG_ARM_R5,
11 PERF_REG_ARM_R6,
12 PERF_REG_ARM_R7,
13 PERF_REG_ARM_R8,
14 PERF_REG_ARM_R9,
15 PERF_REG_ARM_R10,
16 PERF_REG_ARM_FP,
17 PERF_REG_ARM_IP,
18 PERF_REG_ARM_SP,
19 PERF_REG_ARM_LR,
20 PERF_REG_ARM_PC,
21 PERF_REG_ARM_MAX,
22};
23#endif /* _ASM_ARM_PERF_REGS_H */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 5140df5f23aa..a30fc9be9e9e 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg
17 17
18obj-y := elf.o entry-common.o irq.o opcodes.o \ 18obj-y := elf.o entry-common.o irq.o opcodes.o \
19 process.o ptrace.o return_address.o \ 19 process.o ptrace.o return_address.o \
20 setup.o signal.o stacktrace.o sys_arm.o time.o traps.o 20 setup.o signal.o sigreturn_codes.o \
21 stacktrace.o sys_arm.o time.o traps.o
21 22
22obj-$(CONFIG_ATAGS) += atags_parse.o 23obj-$(CONFIG_ATAGS) += atags_parse.o
23obj-$(CONFIG_ATAGS_PROC) += atags_proc.o 24obj-$(CONFIG_ATAGS_PROC) += atags_proc.o
@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3) += xscale-cp0.o
78obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o 79obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o
79obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o 80obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o
80obj-$(CONFIG_IWMMXT) += iwmmxt.o 81obj-$(CONFIG_IWMMXT) += iwmmxt.o
82obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
81obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o 83obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o
82AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt 84AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt
83obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o 85obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 60d3b738d420..1f031ddd0667 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
155 155
156#ifdef CONFIG_ARM_PATCH_PHYS_VIRT 156#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
157EXPORT_SYMBOL(__pv_phys_offset); 157EXPORT_SYMBOL(__pv_phys_offset);
158EXPORT_SYMBOL(__pv_offset);
158#endif 159#endif
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9cbe70c8b0ef..b3fb8c9e1ff2 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -192,6 +192,7 @@ __dabt_svc:
192 svc_entry 192 svc_entry
193 mov r2, sp 193 mov r2, sp
194 dabt_helper 194 dabt_helper
195 THUMB( ldr r5, [sp, #S_PSR] ) @ potentially updated CPSR
195 svc_exit r5 @ return from exception 196 svc_exit r5 @ return from exception
196 UNWIND(.fnend ) 197 UNWIND(.fnend )
197ENDPROC(__dabt_svc) 198ENDPROC(__dabt_svc)
@@ -416,9 +417,8 @@ __und_usr:
416 bne __und_usr_thumb 417 bne __und_usr_thumb
417 sub r4, r2, #4 @ ARM instr at LR - 4 418 sub r4, r2, #4 @ ARM instr at LR - 4
4181: ldrt r0, [r4] 4191: ldrt r0, [r4]
419#ifdef CONFIG_CPU_ENDIAN_BE8 420 ARM_BE8(rev r0, r0) @ little endian instruction
420 rev r0, r0 @ little endian instruction 421
421#endif
422 @ r0 = 32-bit ARM instruction which caused the exception 422 @ r0 = 32-bit ARM instruction which caused the exception
423 @ r2 = PC value for the following instruction (:= regs->ARM_pc) 423 @ r2 = PC value for the following instruction (:= regs->ARM_pc)
424 @ r4 = PC value for the faulting instruction 424 @ r4 = PC value for the faulting instruction
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index bc6bd9683ba4..a2dcafdf1bc8 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -393,9 +393,7 @@ ENTRY(vector_swi)
393#else 393#else
394 USER( ldr r10, [lr, #-4] ) @ get SWI instruction 394 USER( ldr r10, [lr, #-4] ) @ get SWI instruction
395#endif 395#endif
396#ifdef CONFIG_CPU_ENDIAN_BE8 396 ARM_BE8(rev r10, r10) @ little endian instruction
397 rev r10, r10 @ little endian instruction
398#endif
399 397
400#elif defined(CONFIG_AEABI) 398#elif defined(CONFIG_AEABI)
401 399
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 476de57dcef2..7801866e626a 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -77,6 +77,7 @@
77 77
78 __HEAD 78 __HEAD
79ENTRY(stext) 79ENTRY(stext)
80 ARM_BE8(setend be ) @ ensure we are in BE8 mode
80 81
81 THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM. 82 THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM.
82 THUMB( bx r9 ) @ If this is a Thumb-2 kernel, 83 THUMB( bx r9 ) @ If this is a Thumb-2 kernel,
@@ -352,6 +353,9 @@ ENTRY(secondary_startup)
352 * the processor type - there is no need to check the machine type 353 * the processor type - there is no need to check the machine type
353 * as it has already been validated by the primary processor. 354 * as it has already been validated by the primary processor.
354 */ 355 */
356
357 ARM_BE8(setend be) @ ensure we are in BE8 mode
358
355#ifdef CONFIG_ARM_VIRT_EXT 359#ifdef CONFIG_ARM_VIRT_EXT
356 bl __hyp_stub_install_secondary 360 bl __hyp_stub_install_secondary
357#endif 361#endif
@@ -555,6 +559,14 @@ ENTRY(fixup_smp)
555 ldmfd sp!, {r4 - r6, pc} 559 ldmfd sp!, {r4 - r6, pc}
556ENDPROC(fixup_smp) 560ENDPROC(fixup_smp)
557 561
562#ifdef __ARMEB__
563#define LOW_OFFSET 0x4
564#define HIGH_OFFSET 0x0
565#else
566#define LOW_OFFSET 0x0
567#define HIGH_OFFSET 0x4
568#endif
569
558#ifdef CONFIG_ARM_PATCH_PHYS_VIRT 570#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
559 571
560/* __fixup_pv_table - patch the stub instructions with the delta between 572/* __fixup_pv_table - patch the stub instructions with the delta between
@@ -565,17 +577,20 @@ ENDPROC(fixup_smp)
565 __HEAD 577 __HEAD
566__fixup_pv_table: 578__fixup_pv_table:
567 adr r0, 1f 579 adr r0, 1f
568 ldmia r0, {r3-r5, r7} 580 ldmia r0, {r3-r7}
569 sub r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET 581 mvn ip, #0
582 subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET
570 add r4, r4, r3 @ adjust table start address 583 add r4, r4, r3 @ adjust table start address
571 add r5, r5, r3 @ adjust table end address 584 add r5, r5, r3 @ adjust table end address
572 add r7, r7, r3 @ adjust __pv_phys_offset address 585 add r6, r6, r3 @ adjust __pv_phys_offset address
573 str r8, [r7] @ save computed PHYS_OFFSET to __pv_phys_offset 586 add r7, r7, r3 @ adjust __pv_offset address
587 str r8, [r6, #LOW_OFFSET] @ save computed PHYS_OFFSET to __pv_phys_offset
588 strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits
574 mov r6, r3, lsr #24 @ constant for add/sub instructions 589 mov r6, r3, lsr #24 @ constant for add/sub instructions
575 teq r3, r6, lsl #24 @ must be 16MiB aligned 590 teq r3, r6, lsl #24 @ must be 16MiB aligned
576THUMB( it ne @ cross section branch ) 591THUMB( it ne @ cross section branch )
577 bne __error 592 bne __error
578 str r6, [r7, #4] @ save to __pv_offset 593 str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits
579 b __fixup_a_pv_table 594 b __fixup_a_pv_table
580ENDPROC(__fixup_pv_table) 595ENDPROC(__fixup_pv_table)
581 596
@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table)
584 .long __pv_table_begin 599 .long __pv_table_begin
585 .long __pv_table_end 600 .long __pv_table_end
5862: .long __pv_phys_offset 6012: .long __pv_phys_offset
602 .long __pv_offset
587 603
588 .text 604 .text
589__fixup_a_pv_table: 605__fixup_a_pv_table:
606 adr r0, 3f
607 ldr r6, [r0]
608 add r6, r6, r3
609 ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word
610 ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word
611 mov r6, r6, lsr #24
612 cmn r0, #1
590#ifdef CONFIG_THUMB2_KERNEL 613#ifdef CONFIG_THUMB2_KERNEL
614 moveq r0, #0x200000 @ set bit 21, mov to mvn instruction
591 lsls r6, #24 615 lsls r6, #24
592 beq 2f 616 beq 2f
593 clz r7, r6 617 clz r7, r6
@@ -601,18 +625,42 @@ __fixup_a_pv_table:
601 b 2f 625 b 2f
6021: add r7, r3 6261: add r7, r3
603 ldrh ip, [r7, #2] 627 ldrh ip, [r7, #2]
604 and ip, 0x8f00 628ARM_BE8(rev16 ip, ip)
605 orr ip, r6 @ mask in offset bits 31-24 629 tst ip, #0x4000
630 and ip, #0x8f00
631 orrne ip, r6 @ mask in offset bits 31-24
632 orreq ip, r0 @ mask in offset bits 7-0
633ARM_BE8(rev16 ip, ip)
606 strh ip, [r7, #2] 634 strh ip, [r7, #2]
635 bne 2f
636 ldrh ip, [r7]
637ARM_BE8(rev16 ip, ip)
638 bic ip, #0x20
639 orr ip, ip, r0, lsr #16
640ARM_BE8(rev16 ip, ip)
641 strh ip, [r7]
6072: cmp r4, r5 6422: cmp r4, r5
608 ldrcc r7, [r4], #4 @ use branch for delay slot 643 ldrcc r7, [r4], #4 @ use branch for delay slot
609 bcc 1b 644 bcc 1b
610 bx lr 645 bx lr
611#else 646#else
647 moveq r0, #0x400000 @ set bit 22, mov to mvn instruction
612 b 2f 648 b 2f
6131: ldr ip, [r7, r3] 6491: ldr ip, [r7, r3]
650#ifdef CONFIG_CPU_ENDIAN_BE8
651 @ in BE8, we load data in BE, but instructions still in LE
652 bic ip, ip, #0xff000000
653 tst ip, #0x000f0000 @ check the rotation field
654 orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24
655 biceq ip, ip, #0x00004000 @ clear bit 22
656 orreq ip, ip, r0, lsl #24 @ mask in offset bits 7-0
657#else
614 bic ip, ip, #0x000000ff 658 bic ip, ip, #0x000000ff
615 orr ip, ip, r6 @ mask in offset bits 31-24 659 tst ip, #0xf00 @ check the rotation field
660 orrne ip, ip, r6 @ mask in offset bits 31-24
661 biceq ip, ip, #0x400000 @ clear bit 22
662 orreq ip, ip, r0 @ mask in offset bits 7-0
663#endif
616 str ip, [r7, r3] 664 str ip, [r7, r3]
6172: cmp r4, r5 6652: cmp r4, r5
618 ldrcc r7, [r4], #4 @ use branch for delay slot 666 ldrcc r7, [r4], #4 @ use branch for delay slot
@@ -621,28 +669,30 @@ __fixup_a_pv_table:
621#endif 669#endif
622ENDPROC(__fixup_a_pv_table) 670ENDPROC(__fixup_a_pv_table)
623 671
672 .align
6733: .long __pv_offset
674
624ENTRY(fixup_pv_table) 675ENTRY(fixup_pv_table)
625 stmfd sp!, {r4 - r7, lr} 676 stmfd sp!, {r4 - r7, lr}
626 ldr r2, 2f @ get address of __pv_phys_offset
627 mov r3, #0 @ no offset 677 mov r3, #0 @ no offset
628 mov r4, r0 @ r0 = table start 678 mov r4, r0 @ r0 = table start
629 add r5, r0, r1 @ r1 = table size 679 add r5, r0, r1 @ r1 = table size
630 ldr r6, [r2, #4] @ get __pv_offset
631 bl __fixup_a_pv_table 680 bl __fixup_a_pv_table
632 ldmfd sp!, {r4 - r7, pc} 681 ldmfd sp!, {r4 - r7, pc}
633ENDPROC(fixup_pv_table) 682ENDPROC(fixup_pv_table)
634 683
635 .align
6362: .long __pv_phys_offset
637
638 .data 684 .data
639 .globl __pv_phys_offset 685 .globl __pv_phys_offset
640 .type __pv_phys_offset, %object 686 .type __pv_phys_offset, %object
641__pv_phys_offset: 687__pv_phys_offset:
642 .long 0 688 .quad 0
643 .size __pv_phys_offset, . - __pv_phys_offset 689 .size __pv_phys_offset, . -__pv_phys_offset
690
691 .globl __pv_offset
692 .type __pv_offset, %object
644__pv_offset: 693__pv_offset:
645 .long 0 694 .quad 0
695 .size __pv_offset, . -__pv_offset
646#endif 696#endif
647 697
648#include "head-common.S" 698#include "head-common.S"
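For the ARM (non-Thumb) encoding path, the byte patched into each add/sub stub is simply bits 31..24 of the low word of the offset, which is why __fixup_pv_table still insists on 16MiB alignment (the teq r3, r6, lsl #24 check). A small host-side model of that check and extraction:

#include <stdint.h>
#include <assert.h>

/* Host-side model of what __fixup_pv_table derives from the low offset word. */
static uint8_t pv_low_to_imm8(uint32_t pv_offset_low)
{
	/* only bits 31..24 fit in the stub's rotated 8-bit immediate,
	 * so the offset must be a multiple of 16MiB */
	assert((pv_offset_low & 0x00ffffff) == 0);

	return (uint8_t)(pv_offset_low >> 24);	/* byte patched into each add/sub */
}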
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 7b95de601357..3d446605cbf8 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
344 /* Breakpoint */ 344 /* Breakpoint */
345 ctrl_base = ARM_BASE_BCR; 345 ctrl_base = ARM_BASE_BCR;
346 val_base = ARM_BASE_BVR; 346 val_base = ARM_BASE_BVR;
347 slots = (struct perf_event **)__get_cpu_var(bp_on_reg); 347 slots = this_cpu_ptr(bp_on_reg);
348 max_slots = core_num_brps; 348 max_slots = core_num_brps;
349 } else { 349 } else {
350 /* Watchpoint */ 350 /* Watchpoint */
351 ctrl_base = ARM_BASE_WCR; 351 ctrl_base = ARM_BASE_WCR;
352 val_base = ARM_BASE_WVR; 352 val_base = ARM_BASE_WVR;
353 slots = (struct perf_event **)__get_cpu_var(wp_on_reg); 353 slots = this_cpu_ptr(wp_on_reg);
354 max_slots = core_num_wrps; 354 max_slots = core_num_wrps;
355 } 355 }
356 356
@@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
396 if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) { 396 if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
397 /* Breakpoint */ 397 /* Breakpoint */
398 base = ARM_BASE_BCR; 398 base = ARM_BASE_BCR;
399 slots = (struct perf_event **)__get_cpu_var(bp_on_reg); 399 slots = this_cpu_ptr(bp_on_reg);
400 max_slots = core_num_brps; 400 max_slots = core_num_brps;
401 } else { 401 } else {
402 /* Watchpoint */ 402 /* Watchpoint */
403 base = ARM_BASE_WCR; 403 base = ARM_BASE_WCR;
404 slots = (struct perf_event **)__get_cpu_var(wp_on_reg); 404 slots = this_cpu_ptr(wp_on_reg);
405 max_slots = core_num_wrps; 405 max_slots = core_num_wrps;
406 } 406 }
407 407
@@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
697 struct arch_hw_breakpoint *info; 697 struct arch_hw_breakpoint *info;
698 struct arch_hw_breakpoint_ctrl ctrl; 698 struct arch_hw_breakpoint_ctrl ctrl;
699 699
700 slots = (struct perf_event **)__get_cpu_var(wp_on_reg); 700 slots = this_cpu_ptr(wp_on_reg);
701 701
702 for (i = 0; i < core_num_wrps; ++i) { 702 for (i = 0; i < core_num_wrps; ++i) {
703 rcu_read_lock(); 703 rcu_read_lock();
@@ -768,7 +768,7 @@ static void watchpoint_single_step_handler(unsigned long pc)
768 struct perf_event *wp, **slots; 768 struct perf_event *wp, **slots;
769 struct arch_hw_breakpoint *info; 769 struct arch_hw_breakpoint *info;
770 770
771 slots = (struct perf_event **)__get_cpu_var(wp_on_reg); 771 slots = this_cpu_ptr(wp_on_reg);
772 772
773 for (i = 0; i < core_num_wrps; ++i) { 773 for (i = 0; i < core_num_wrps; ++i) {
774 rcu_read_lock(); 774 rcu_read_lock();
@@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
802 struct arch_hw_breakpoint *info; 802 struct arch_hw_breakpoint *info;
803 struct arch_hw_breakpoint_ctrl ctrl; 803 struct arch_hw_breakpoint_ctrl ctrl;
804 804
805 slots = (struct perf_event **)__get_cpu_var(bp_on_reg); 805 slots = this_cpu_ptr(bp_on_reg);
806 806
807 /* The exception entry code places the amended lr in the PC. */ 807 /* The exception entry code places the amended lr in the PC. */
808 addr = regs->ARM_pc; 808 addr = regs->ARM_pc;
diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c
index 170e9f34003f..a7b621ece23d 100644
--- a/arch/arm/kernel/kprobes.c
+++ b/arch/arm/kernel/kprobes.c
@@ -171,13 +171,13 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
171 171
172static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) 172static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
173{ 173{
174 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; 174 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
175 kcb->kprobe_status = kcb->prev_kprobe.status; 175 kcb->kprobe_status = kcb->prev_kprobe.status;
176} 176}
177 177
178static void __kprobes set_current_kprobe(struct kprobe *p) 178static void __kprobes set_current_kprobe(struct kprobe *p)
179{ 179{
180 __get_cpu_var(current_kprobe) = p; 180 __this_cpu_write(current_kprobe, p);
181} 181}
182 182
183static void __kprobes 183static void __kprobes
@@ -421,10 +421,10 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
421 continue; 421 continue;
422 422
423 if (ri->rp && ri->rp->handler) { 423 if (ri->rp && ri->rp->handler) {
424 __get_cpu_var(current_kprobe) = &ri->rp->kp; 424 __this_cpu_write(current_kprobe, &ri->rp->kp);
425 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 425 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
426 ri->rp->handler(ri, regs); 426 ri->rp->handler(ri, regs);
427 __get_cpu_var(current_kprobe) = NULL; 427 __this_cpu_write(current_kprobe, NULL);
428 } 428 }
429 429
430 orig_ret_address = (unsigned long)ri->ret_addr; 430 orig_ret_address = (unsigned long)ri->ret_addr;
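This file and hw_breakpoint.c above are straight conversions from the deprecated __get_cpu_var() accessor to the this_cpu API, which reads or writes the current CPU's copy without open-coded offset arithmetic or casts. A short kernel-style reminder of the pattern (fragment only, not a complete translation unit):

DEFINE_PER_CPU(struct kprobe *, current_kprobe);

static void set_current_kprobe_example(struct kprobe *p)
{
	__this_cpu_write(current_kprobe, p);	/* was: __get_cpu_var(current_kprobe) = p; */
}

static struct kprobe **current_kprobe_slot_example(void)
{
	return this_cpu_ptr(&current_kprobe);	/* pointer into this CPU's copy, no cast */
}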
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index c9dfff3b8008..45e478157278 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -24,6 +24,7 @@
24#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/smp_plat.h> 25#include <asm/smp_plat.h>
26#include <asm/unwind.h> 26#include <asm/unwind.h>
27#include <asm/opcodes.h>
27 28
28#ifdef CONFIG_XIP_KERNEL 29#ifdef CONFIG_XIP_KERNEL
29/* 30/*
@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
60 Elf32_Sym *sym; 61 Elf32_Sym *sym;
61 const char *symname; 62 const char *symname;
62 s32 offset; 63 s32 offset;
64 u32 tmp;
63#ifdef CONFIG_THUMB2_KERNEL 65#ifdef CONFIG_THUMB2_KERNEL
64 u32 upper, lower, sign, j1, j2; 66 u32 upper, lower, sign, j1, j2;
65#endif 67#endif
@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
95 case R_ARM_PC24: 97 case R_ARM_PC24:
96 case R_ARM_CALL: 98 case R_ARM_CALL:
97 case R_ARM_JUMP24: 99 case R_ARM_JUMP24:
98 offset = (*(u32 *)loc & 0x00ffffff) << 2; 100 offset = __mem_to_opcode_arm(*(u32 *)loc);
101 offset = (offset & 0x00ffffff) << 2;
99 if (offset & 0x02000000) 102 if (offset & 0x02000000)
100 offset -= 0x04000000; 103 offset -= 0x04000000;
101 104
@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
111 } 114 }
112 115
113 offset >>= 2; 116 offset >>= 2;
117 offset &= 0x00ffffff;
114 118
115 *(u32 *)loc &= 0xff000000; 119 *(u32 *)loc &= __opcode_to_mem_arm(0xff000000);
116 *(u32 *)loc |= offset & 0x00ffffff; 120 *(u32 *)loc |= __opcode_to_mem_arm(offset);
117 break; 121 break;
118 122
119 case R_ARM_V4BX: 123 case R_ARM_V4BX:
@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
121 * other bits to re-code instruction as 125 * other bits to re-code instruction as
122 * MOV PC,Rm. 126 * MOV PC,Rm.
123 */ 127 */
124 *(u32 *)loc &= 0xf000000f; 128 *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f);
125 *(u32 *)loc |= 0x01a0f000; 129 *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000);
126 break; 130 break;
127 131
128 case R_ARM_PREL31: 132 case R_ARM_PREL31:
@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
132 136
133 case R_ARM_MOVW_ABS_NC: 137 case R_ARM_MOVW_ABS_NC:
134 case R_ARM_MOVT_ABS: 138 case R_ARM_MOVT_ABS:
135 offset = *(u32 *)loc; 139 offset = tmp = __mem_to_opcode_arm(*(u32 *)loc);
136 offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); 140 offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff);
137 offset = (offset ^ 0x8000) - 0x8000; 141 offset = (offset ^ 0x8000) - 0x8000;
138 142
@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
140 if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) 144 if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS)
141 offset >>= 16; 145 offset >>= 16;
142 146
143 *(u32 *)loc &= 0xfff0f000; 147 tmp &= 0xfff0f000;
144 *(u32 *)loc |= ((offset & 0xf000) << 4) | 148 tmp |= ((offset & 0xf000) << 4) |
145 (offset & 0x0fff); 149 (offset & 0x0fff);
150
151 *(u32 *)loc = __opcode_to_mem_arm(tmp);
146 break; 152 break;
147 153
148#ifdef CONFIG_THUMB2_KERNEL 154#ifdef CONFIG_THUMB2_KERNEL
149 case R_ARM_THM_CALL: 155 case R_ARM_THM_CALL:
150 case R_ARM_THM_JUMP24: 156 case R_ARM_THM_JUMP24:
151 upper = *(u16 *)loc; 157 upper = __mem_to_opcode_thumb16(*(u16 *)loc);
152 lower = *(u16 *)(loc + 2); 158 lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
153 159
154 /* 160 /*
155 * 25 bit signed address range (Thumb-2 BL and B.W 161 * 25 bit signed address range (Thumb-2 BL and B.W
@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
198 sign = (offset >> 24) & 1; 204 sign = (offset >> 24) & 1;
199 j1 = sign ^ (~(offset >> 23) & 1); 205 j1 = sign ^ (~(offset >> 23) & 1);
200 j2 = sign ^ (~(offset >> 22) & 1); 206 j2 = sign ^ (~(offset >> 22) & 1);
201 *(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) | 207 upper = (u16)((upper & 0xf800) | (sign << 10) |
202 ((offset >> 12) & 0x03ff)); 208 ((offset >> 12) & 0x03ff));
203 *(u16 *)(loc + 2) = (u16)((lower & 0xd000) | 209 lower = (u16)((lower & 0xd000) |
204 (j1 << 13) | (j2 << 11) | 210 (j1 << 13) | (j2 << 11) |
205 ((offset >> 1) & 0x07ff)); 211 ((offset >> 1) & 0x07ff));
212
213 *(u16 *)loc = __opcode_to_mem_thumb16(upper);
214 *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
206 break; 215 break;
207 216
208 case R_ARM_THM_MOVW_ABS_NC: 217 case R_ARM_THM_MOVW_ABS_NC:
209 case R_ARM_THM_MOVT_ABS: 218 case R_ARM_THM_MOVT_ABS:
210 upper = *(u16 *)loc; 219 upper = __mem_to_opcode_thumb16(*(u16 *)loc);
211 lower = *(u16 *)(loc + 2); 220 lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
212 221
213 /* 222 /*
214 * MOVT/MOVW instructions encoding in Thumb-2: 223 * MOVT/MOVW instructions encoding in Thumb-2:
@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
229 if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) 238 if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS)
230 offset >>= 16; 239 offset >>= 16;
231 240
232 *(u16 *)loc = (u16)((upper & 0xfbf0) | 241 upper = (u16)((upper & 0xfbf0) |
233 ((offset & 0xf000) >> 12) | 242 ((offset & 0xf000) >> 12) |
234 ((offset & 0x0800) >> 1)); 243 ((offset & 0x0800) >> 1));
235 *(u16 *)(loc + 2) = (u16)((lower & 0x8f00) | 244 lower = (u16)((lower & 0x8f00) |
236 ((offset & 0x0700) << 4) | 245 ((offset & 0x0700) << 4) |
237 (offset & 0x00ff)); 246 (offset & 0x00ff));
247 *(u16 *)loc = __opcode_to_mem_thumb16(upper);
248 *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
238 break; 249 break;
239#endif 250#endif
240 251
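The relocation fix-ups are rewritten in terms of the asm/opcodes.h helpers because, on a CONFIG_CPU_ENDIAN_BE8 kernel, data accesses are big-endian while instructions stay little-endian in memory, so an opcode handled through a u32 pointer must be byte-swapped on the way in and out. A stand-alone model of the pattern used for the 24-bit branch relocation (the macro names mirror the kernel ones, with a plain compile-time define standing in for the config option):

#include <stdint.h>

#ifdef MODEL_BE8				/* stands in for CONFIG_CPU_ENDIAN_BE8 */
#define mem_to_opcode_arm(x)	__builtin_bswap32(x)
#define opcode_to_mem_arm(x)	__builtin_bswap32(x)
#else
#define mem_to_opcode_arm(x)	(x)
#define opcode_to_mem_arm(x)	(x)
#endif

/* Patch the 24-bit immediate of an ARM B/BL instruction at *loc. */
static void patch_branch_imm24(uint32_t *loc, uint32_t imm24)
{
	uint32_t insn = mem_to_opcode_arm(*loc);	/* canonical (CPU-order) opcode */

	insn = (insn & 0xff000000) | (imm24 & 0x00ffffff);
	*loc = opcode_to_mem_arm(insn);			/* back to in-memory order */
}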
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index e186ee1e63f6..bc3f2efa0d86 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events,
256 struct perf_event *event) 256 struct perf_event *event)
257{ 257{
258 struct arm_pmu *armpmu = to_arm_pmu(event->pmu); 258 struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
259 struct pmu *leader_pmu = event->group_leader->pmu;
260 259
261 if (is_software_event(event)) 260 if (is_software_event(event))
262 return 1; 261 return 1;
263 262
264 if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF) 263 if (event->state < PERF_EVENT_STATE_OFF)
265 return 1; 264 return 1;
266 265
267 if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) 266 if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index 8d6147b2001f..d85055cd24ba 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(perf_num_counters);
68 68
69static struct pmu_hw_events *cpu_pmu_get_cpu_events(void) 69static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
70{ 70{
71 return &__get_cpu_var(cpu_hw_events); 71 return this_cpu_ptr(&cpu_hw_events);
72} 72}
73 73
74static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) 74static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
new file mode 100644
index 000000000000..6e4379c67cbc
--- /dev/null
+++ b/arch/arm/kernel/perf_regs.c
@@ -0,0 +1,30 @@
1
2#include <linux/errno.h>
3#include <linux/kernel.h>
4#include <linux/perf_event.h>
5#include <linux/bug.h>
6#include <asm/perf_regs.h>
7#include <asm/ptrace.h>
8
9u64 perf_reg_value(struct pt_regs *regs, int idx)
10{
11 if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
12 return 0;
13
14 return regs->uregs[idx];
15}
16
17#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
18
19int perf_reg_validate(u64 mask)
20{
21 if (!mask || mask & REG_RESERVED)
22 return -EINVAL;
23
24 return 0;
25}
26
27u64 perf_reg_abi(struct task_struct *task)
28{
29 return PERF_SAMPLE_REGS_ABI_32;
30}
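Together with the uapi header above, this is the ARM glue for PERF_SAMPLE_REGS_USER: perf_reg_value() fishes registers out of the interrupted pt_regs and perf_reg_validate() rejects any bits at or above PERF_REG_ARM_MAX. A brief user-space sketch of requesting a register dump in samples; it assumes headers from a kernel with this support installed:

#include <linux/perf_event.h>
#include <asm/perf_regs.h>

static void request_arm_regs(struct perf_event_attr *attr)
{
	attr->sample_type      |= PERF_SAMPLE_REGS_USER;
	attr->sample_regs_user  = (1ULL << PERF_REG_ARM_PC) |	/* mask must pass perf_reg_validate() */
				  (1ULL << PERF_REG_ARM_LR) |
				  (1ULL << PERF_REG_ARM_SP);
}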
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 5d65438685d8..6a1b8a81b1ae 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup);
73#endif 73#endif
74 74
75extern void paging_init(const struct machine_desc *desc); 75extern void paging_init(const struct machine_desc *desc);
76extern void early_paging_init(const struct machine_desc *,
77 struct proc_info_list *);
76extern void sanity_check_meminfo(void); 78extern void sanity_check_meminfo(void);
77extern enum reboot_mode reboot_mode; 79extern enum reboot_mode reboot_mode;
78extern void setup_dma_zone(const struct machine_desc *desc); 80extern void setup_dma_zone(const struct machine_desc *desc);
@@ -599,6 +601,8 @@ static void __init setup_processor(void)
599 elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT); 601 elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
600#endif 602#endif
601 603
604 erratum_a15_798181_init();
605
602 feat_v6_fixup(); 606 feat_v6_fixup();
603 607
604 cacheid_init(); 608 cacheid_init();
@@ -619,9 +623,10 @@ void __init dump_machine_table(void)
619 /* can't use cpu_relax() here as it may require MMU setup */; 623 /* can't use cpu_relax() here as it may require MMU setup */;
620} 624}
621 625
622int __init arm_add_memory(phys_addr_t start, phys_addr_t size) 626int __init arm_add_memory(u64 start, u64 size)
623{ 627{
624 struct membank *bank = &meminfo.bank[meminfo.nr_banks]; 628 struct membank *bank = &meminfo.bank[meminfo.nr_banks];
629 u64 aligned_start;
625 630
626 if (meminfo.nr_banks >= NR_BANKS) { 631 if (meminfo.nr_banks >= NR_BANKS) {
627 printk(KERN_CRIT "NR_BANKS too low, " 632 printk(KERN_CRIT "NR_BANKS too low, "
@@ -634,10 +639,16 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
634 * Size is appropriately rounded down, start is rounded up. 639 * Size is appropriately rounded down, start is rounded up.
635 */ 640 */
636 size -= start & ~PAGE_MASK; 641 size -= start & ~PAGE_MASK;
637 bank->start = PAGE_ALIGN(start); 642 aligned_start = PAGE_ALIGN(start);
638 643
639#ifndef CONFIG_ARM_LPAE 644#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
640 if (bank->start + size < bank->start) { 645 if (aligned_start > ULONG_MAX) {
646 printk(KERN_CRIT "Ignoring memory at 0x%08llx outside "
647 "32-bit physical address space\n", (long long)start);
648 return -EINVAL;
649 }
650
651 if (aligned_start + size > ULONG_MAX) {
641 printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in " 652 printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in "
642 "32-bit physical address space\n", (long long)start); 653 "32-bit physical address space\n", (long long)start);
643 /* 654 /*
@@ -645,10 +656,11 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
645 * 32 bits, we use ULONG_MAX as the upper limit rather than 4GB. 656 * 32 bits, we use ULONG_MAX as the upper limit rather than 4GB.
646 * This means we lose a page after masking. 657 * This means we lose a page after masking.
647 */ 658 */
648 size = ULONG_MAX - bank->start; 659 size = ULONG_MAX - aligned_start;
649 } 660 }
650#endif 661#endif
651 662
663 bank->start = aligned_start;
652 bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1); 664 bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1);
653 665
654 /* 666 /*
@@ -669,8 +681,8 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
669static int __init early_mem(char *p) 681static int __init early_mem(char *p)
670{ 682{
671 static int usermem __initdata = 0; 683 static int usermem __initdata = 0;
672 phys_addr_t size; 684 u64 size;
673 phys_addr_t start; 685 u64 start;
674 char *endp; 686 char *endp;
675 687
676 /* 688 /*
@@ -878,6 +890,8 @@ void __init setup_arch(char **cmdline_p)
878 parse_early_param(); 890 parse_early_param();
879 891
880 sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); 892 sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
893
894 early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
881 sanity_check_meminfo(); 895 sanity_check_meminfo();
882 arm_memblock_init(&meminfo, mdesc); 896 arm_memblock_init(&meminfo, mdesc);
883 897
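arm_add_memory() now takes u64 so a mem= bank above 4GiB can at least be parsed on a kernel whose phys_addr_t is 32-bit: such a bank is dropped outright, and one straddling the 4GiB boundary is truncated at ULONG_MAX. A small stand-alone model of that clamping (page size and the 32-bit limit hard-coded for the example):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_MODEL	4096ULL
#define LIMIT_32BIT	0xffffffffULL		/* ULONG_MAX on 32-bit ARM */

static int add_memory_model(uint64_t start, uint64_t size)
{
	uint64_t aligned_start = (start + PAGE_SIZE_MODEL - 1) & ~(PAGE_SIZE_MODEL - 1);

	size -= start & (PAGE_SIZE_MODEL - 1);		/* shrink size as start is rounded up */

	if (aligned_start > LIMIT_32BIT)		/* entirely above 4GiB: ignore */
		return -1;
	if (aligned_start + size > LIMIT_32BIT)		/* straddles 4GiB: truncate */
		size = LIMIT_32BIT - aligned_start;

	size &= ~(PAGE_SIZE_MODEL - 1);
	printf("bank at 0x%llx, 0x%llx bytes\n",
	       (unsigned long long)aligned_start, (unsigned long long)size);
	return 0;
}

int main(void)
{
	add_memory_model(0x80000000ULL, 0x100000000ULL);	/* 4GiB at 2GiB gets truncated */
	return 0;
}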
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index ab3304225272..04d63880037f 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -21,29 +21,7 @@
21#include <asm/unistd.h> 21#include <asm/unistd.h>
22#include <asm/vfp.h> 22#include <asm/vfp.h>
23 23
24/* 24extern const unsigned long sigreturn_codes[7];
25 * For ARM syscalls, we encode the syscall number into the instruction.
26 */
27#define SWI_SYS_SIGRETURN (0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE))
28#define SWI_SYS_RT_SIGRETURN (0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE))
29
30/*
31 * With EABI, the syscall number has to be loaded into r7.
32 */
33#define MOV_R7_NR_SIGRETURN (0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE))
34#define MOV_R7_NR_RT_SIGRETURN (0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
35
36/*
37 * For Thumb syscalls, we pass the syscall number via r7. We therefore
38 * need two 16-bit instructions.
39 */
40#define SWI_THUMB_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE))
41#define SWI_THUMB_RT_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
42
43static const unsigned long sigreturn_codes[7] = {
44 MOV_R7_NR_SIGRETURN, SWI_SYS_SIGRETURN, SWI_THUMB_SIGRETURN,
45 MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN,
46};
47 25
48static unsigned long signal_return_offset; 26static unsigned long signal_return_offset;
49 27
@@ -375,12 +353,18 @@ setup_return(struct pt_regs *regs, struct ksignal *ksig,
375 */ 353 */
376 thumb = handler & 1; 354 thumb = handler & 1;
377 355
378 if (thumb) {
379 cpsr |= PSR_T_BIT;
380#if __LINUX_ARM_ARCH__ >= 7 356#if __LINUX_ARM_ARCH__ >= 7
381 /* clear the If-Then Thumb-2 execution state */ 357 /*
382 cpsr &= ~PSR_IT_MASK; 358 * Clear the If-Then Thumb-2 execution state
359 * ARM spec requires this to be all 000s in ARM mode
360 * Snapdragon S4/Krait misbehaves on a Thumb=>ARM
361 * signal transition without this.
362 */
363 cpsr &= ~PSR_IT_MASK;
383#endif 364#endif
365
366 if (thumb) {
367 cpsr |= PSR_T_BIT;
384 } else 368 } else
385 cpsr &= ~PSR_T_BIT; 369 cpsr &= ~PSR_T_BIT;
386 } 370 }
diff --git a/arch/arm/kernel/sigreturn_codes.S b/arch/arm/kernel/sigreturn_codes.S
new file mode 100644
index 000000000000..3c5d0f2170fd
--- /dev/null
+++ b/arch/arm/kernel/sigreturn_codes.S
@@ -0,0 +1,80 @@
1/*
2 * sigreturn_codes.S - code snippets for sigreturn syscalls
3 *
4 * Created by: Victor Kamensky, 2013-08-13
5 * Copyright: (C) 2013 Linaro Limited
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <asm/unistd.h>
18
19/*
20 * For ARM syscalls, we encode the syscall number into the instruction.
21 * With EABI, the syscall number has to be loaded into r7. As a result,
22 * the ARM syscall sequence snippet will have a move and an svc in .arm encoding.
23 *
24 * For Thumb syscalls, we pass the syscall number via r7. We therefore
25 * need two 16-bit instructions in .thumb encoding
26 *
27 * Please note the sigreturn_codes snippets are not executed in place.
28 * Instead they are just copied by the kernel into appropriate places.
29 * Code inside arch/arm/kernel/signal.c is very sensitive to the layout
30 * of these code snippets.
31 */
32
33#if __LINUX_ARM_ARCH__ <= 4
34 /*
35 * Note we manually set the minimal arch version that supports
36 * the required Thumb opcodes for early arch variants. It is OK
37 * for this file to be used in combination with other,
38 * lower arch variants, since these code snippets are only
39 * used as input data.
40 */
41 .arch armv4t
42#endif
43
44 .section .rodata
45 .global sigreturn_codes
46 .type sigreturn_codes, #object
47
48 .arm
49
50sigreturn_codes:
51
52 /* ARM sigreturn syscall code snippet */
53 mov r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
54 swi #(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)
55
56 /* Thumb sigreturn syscall code snippet */
57 .thumb
58 movs r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
59 swi #0
60
61 /* ARM sigreturn_rt syscall code snippet */
62 .arm
63 mov r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
64 swi #(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)
65
66 /* Thumb sigreturn_rt syscall code snippet */
67 .thumb
68 movs r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
69 swi #0
70
71 /*
72 * Note on additional space: the setup_return() algorithm in
73 * signal.c always copies two words, regardless of whether it
74 * is the Thumb case or not, so we need an additional word
75 * after the real last entry.
76 */
77 .arm
78 .space 4
79
80 .size sigreturn_codes, . - sigreturn_codes
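
How setup_return() consumes this table is not part of this hunk; the following is a hedged C sketch of the indexing implied by the snippet layout above and by the two-word copy mentioned in the closing note (the helper name is illustrative, not from the patch):

	#include <linux/types.h>

	extern const unsigned long sigreturn_codes[7];

	/*
	 * Layout per the snippets above:
	 *   [0] ARM mov r7, [1] ARM swi             (sigreturn)
	 *   [2] Thumb mov+swi packed into one word  (sigreturn)
	 *   [3] ARM mov r7, [4] ARM swi             (rt_sigreturn)
	 *   [5] Thumb mov+swi packed into one word  (rt_sigreturn)
	 *   [6] padding so a two-word copy from [5] stays in bounds
	 */
	static unsigned long sigreturn_code_index(bool thumb, bool rt)
	{
		unsigned long idx = thumb ? 2 : 0;

		if (rt)
			idx += 3;
		/* setup_return() copies sigreturn_codes[idx] and [idx + 1] */
		return idx;
	}
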
diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S
index db1536b8b30b..b907d9b790ab 100644
--- a/arch/arm/kernel/sleep.S
+++ b/arch/arm/kernel/sleep.S
@@ -55,6 +55,7 @@
55 * specific registers and some other data for resume. 55 * specific registers and some other data for resume.
56 * r0 = suspend function arg0 56 * r0 = suspend function arg0
57 * r1 = suspend function 57 * r1 = suspend function
58 * r2 = MPIDR value the resuming CPU will use
58 */ 59 */
59ENTRY(__cpu_suspend) 60ENTRY(__cpu_suspend)
60 stmfd sp!, {r4 - r11, lr} 61 stmfd sp!, {r4 - r11, lr}
@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend)
67 mov r5, sp @ current virtual SP 68 mov r5, sp @ current virtual SP
68 add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn 69 add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn
69 sub sp, sp, r4 @ allocate CPU state on stack 70 sub sp, sp, r4 @ allocate CPU state on stack
70 stmfd sp!, {r0, r1} @ save suspend func arg and pointer
71 add r0, sp, #8 @ save pointer to save block
72 mov r1, r4 @ size of save block
73 mov r2, r5 @ virtual SP
74 ldr r3, =sleep_save_sp 71 ldr r3, =sleep_save_sp
72 stmfd sp!, {r0, r1} @ save suspend func arg and pointer
75 ldr r3, [r3, #SLEEP_SAVE_SP_VIRT] 73 ldr r3, [r3, #SLEEP_SAVE_SP_VIRT]
76 ALT_SMP(mrc p15, 0, r9, c0, c0, 5) 74 ALT_SMP(ldr r0, =mpidr_hash)
77 ALT_UP_B(1f) 75 ALT_UP_B(1f)
78 ldr r8, =mpidr_hash 76 /* This ldmia relies on the memory layout of the mpidr_hash struct */
79 /* 77 ldmia r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts
80 * This ldmia relies on the memory layout of the mpidr_hash 78 compute_mpidr_hash r0, r6, r7, r8, r2, r1
81 * struct mpidr_hash. 79 add r3, r3, r0, lsl #2
82 */ 801: mov r2, r5 @ virtual SP
83 ldmia r8, {r4-r7} @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts 81 mov r1, r4 @ size of save block
84 compute_mpidr_hash lr, r5, r6, r7, r9, r4 82 add r0, sp, #8 @ pointer to save block
85 add r3, r3, lr, lsl #2
861:
87 bl __cpu_suspend_save 83 bl __cpu_suspend_save
88 adr lr, BSYM(cpu_suspend_abort) 84 adr lr, BSYM(cpu_suspend_abort)
89 ldmfd sp!, {r0, pc} @ call suspend fn 85 ldmfd sp!, {r0, pc} @ call suspend fn
@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu)
130 .data 126 .data
131 .align 127 .align
132ENTRY(cpu_resume) 128ENTRY(cpu_resume)
129ARM_BE8(setend be) @ ensure we are in BE mode
133 mov r1, #0 130 mov r1, #0
134 ALT_SMP(mrc p15, 0, r0, c0, c0, 5) 131 ALT_SMP(mrc p15, 0, r0, c0, c0, 5)
135 ALT_UP_B(1f) 132 ALT_UP_B(1f)
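
The resume path has to locate this CPU's slot in sleep_save_sp from nothing but its MPIDR, which is what the mask and shift values loaded by the ldmia above feed into. A hedged C illustration of the index computed by compute_mpidr_hash (struct and field names here are assumptions for the sketch, mirroring the registers used above):

	#include <linux/types.h>

	struct mpidr_hash_sketch {
		u32 mask;		/* r1: MPIDR affinity bits in use        */
		u32 shift_aff[3];	/* r6-r8: per-affinity-level right shifts */
	};

	static u32 mpidr_hash_index(const struct mpidr_hash_sketch *h, u32 mpidr)
	{
		mpidr &= h->mask;
		/* collapse the three affinity fields into a dense index */
		return ((mpidr & 0x0000ff) >> h->shift_aff[0]) |
		       ((mpidr & 0x00ff00) >> h->shift_aff[1]) |
		       ((mpidr & 0xff0000) >> h->shift_aff[2]);
	}

The "add r3, r3, r0, lsl #2" above then turns that index into a word offset into the sleep_save_sp array.
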
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 72024ea8a3a6..dc894ab3622b 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -25,6 +25,7 @@
25#include <linux/clockchips.h> 25#include <linux/clockchips.h>
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/irq_work.h>
28 29
29#include <linux/atomic.h> 30#include <linux/atomic.h>
30#include <asm/smp.h> 31#include <asm/smp.h>
@@ -66,6 +67,8 @@ enum ipi_msg_type {
66 IPI_CALL_FUNC, 67 IPI_CALL_FUNC,
67 IPI_CALL_FUNC_SINGLE, 68 IPI_CALL_FUNC_SINGLE,
68 IPI_CPU_STOP, 69 IPI_CPU_STOP,
70 IPI_IRQ_WORK,
71 IPI_COMPLETION,
69}; 72};
70 73
71static DECLARE_COMPLETION(cpu_running); 74static DECLARE_COMPLETION(cpu_running);
@@ -80,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops)
80 83
81static unsigned long get_arch_pgd(pgd_t *pgd) 84static unsigned long get_arch_pgd(pgd_t *pgd)
82{ 85{
83 phys_addr_t pgdir = virt_to_phys(pgd); 86 phys_addr_t pgdir = virt_to_idmap(pgd);
84 BUG_ON(pgdir & ARCH_PGD_MASK); 87 BUG_ON(pgdir & ARCH_PGD_MASK);
85 return pgdir >> ARCH_PGD_SHIFT; 88 return pgdir >> ARCH_PGD_SHIFT;
86} 89}
@@ -448,6 +451,14 @@ void arch_send_call_function_single_ipi(int cpu)
448 smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); 451 smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
449} 452}
450 453
454#ifdef CONFIG_IRQ_WORK
455void arch_irq_work_raise(void)
456{
457 if (is_smp())
458 smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
459}
460#endif
461
451static const char *ipi_types[NR_IPI] = { 462static const char *ipi_types[NR_IPI] = {
452#define S(x,s) [x] = s 463#define S(x,s) [x] = s
453 S(IPI_WAKEUP, "CPU wakeup interrupts"), 464 S(IPI_WAKEUP, "CPU wakeup interrupts"),
@@ -456,6 +467,8 @@ static const char *ipi_types[NR_IPI] = {
456 S(IPI_CALL_FUNC, "Function call interrupts"), 467 S(IPI_CALL_FUNC, "Function call interrupts"),
457 S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), 468 S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
458 S(IPI_CPU_STOP, "CPU stop interrupts"), 469 S(IPI_CPU_STOP, "CPU stop interrupts"),
470 S(IPI_IRQ_WORK, "IRQ work interrupts"),
471 S(IPI_COMPLETION, "completion interrupts"),
459}; 472};
460 473
461void show_ipi_list(struct seq_file *p, int prec) 474void show_ipi_list(struct seq_file *p, int prec)
@@ -515,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu)
515 cpu_relax(); 528 cpu_relax();
516} 529}
517 530
531static DEFINE_PER_CPU(struct completion *, cpu_completion);
532
533int register_ipi_completion(struct completion *completion, int cpu)
534{
535 per_cpu(cpu_completion, cpu) = completion;
536 return IPI_COMPLETION;
537}
538
539static void ipi_complete(unsigned int cpu)
540{
541 complete(per_cpu(cpu_completion, cpu));
542}
543
518/* 544/*
519 * Main handler for inter-processor interrupts 545 * Main handler for inter-processor interrupts
520 */ 546 */
@@ -565,6 +591,20 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
565 irq_exit(); 591 irq_exit();
566 break; 592 break;
567 593
594#ifdef CONFIG_IRQ_WORK
595 case IPI_IRQ_WORK:
596 irq_enter();
597 irq_work_run();
598 irq_exit();
599 break;
600#endif
601
602 case IPI_COMPLETION:
603 irq_enter();
604 ipi_complete(cpu);
605 irq_exit();
606 break;
607
568 default: 608 default:
569 printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n", 609 printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
570 cpu, ipinr); 610 cpu, ipinr);
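
IPI_COMPLETION gives callers a way to block until a particular CPU is poked: the waiter registers a struct completion against the CPU that will receive the IPI, and handle_IPI() completes it on arrival. A hedged usage sketch (the caller and the mechanism that raises the IPI are illustrative, not from this patch):

	#include <linux/completion.h>

	static void wait_until_cpu_signalled(int cpu)
	{
		struct completion done;
		int ipi;

		init_completion(&done);
		ipi = register_ipi_completion(&done, cpu);

		/*
		 * ...arrange for 'ipi' (IPI_COMPLETION) to be raised on 'cpu',
		 * e.g. by platform/GIC code once it reaches the interesting
		 * point...
		 */

		/* woken via handle_IPI() -> ipi_complete() on 'cpu' */
		wait_for_completion(&done);
	}
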
diff --git a/arch/arm/kernel/smp_scu.c b/arch/arm/kernel/smp_scu.c
index 5bc1a63284e3..1aafa0d785eb 100644
--- a/arch/arm/kernel/smp_scu.c
+++ b/arch/arm/kernel/smp_scu.c
@@ -28,7 +28,7 @@
28 */ 28 */
29unsigned int __init scu_get_core_count(void __iomem *scu_base) 29unsigned int __init scu_get_core_count(void __iomem *scu_base)
30{ 30{
31 unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG); 31 unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG);
32 return (ncores & 0x03) + 1; 32 return (ncores & 0x03) + 1;
33} 33}
34 34
@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base)
42#ifdef CONFIG_ARM_ERRATA_764369 42#ifdef CONFIG_ARM_ERRATA_764369
43 /* Cortex-A9 only */ 43 /* Cortex-A9 only */
44 if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) { 44 if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) {
45 scu_ctrl = __raw_readl(scu_base + 0x30); 45 scu_ctrl = readl_relaxed(scu_base + 0x30);
46 if (!(scu_ctrl & 1)) 46 if (!(scu_ctrl & 1))
47 __raw_writel(scu_ctrl | 0x1, scu_base + 0x30); 47 writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30);
48 } 48 }
49#endif 49#endif
50 50
51 scu_ctrl = __raw_readl(scu_base + SCU_CTRL); 51 scu_ctrl = readl_relaxed(scu_base + SCU_CTRL);
52 /* already enabled? */ 52 /* already enabled? */
53 if (scu_ctrl & 1) 53 if (scu_ctrl & 1)
54 return; 54 return;
55 55
56 scu_ctrl |= 1; 56 scu_ctrl |= 1;
57 __raw_writel(scu_ctrl, scu_base + SCU_CTRL); 57 writel_relaxed(scu_ctrl, scu_base + SCU_CTRL);
58 58
59 /* 59 /*
60 * Ensure that the data accessed by CPU0 before the SCU was 60 * Ensure that the data accessed by CPU0 before the SCU was
@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode)
80 if (mode > 3 || mode == 1 || cpu > 3) 80 if (mode > 3 || mode == 1 || cpu > 3)
81 return -EINVAL; 81 return -EINVAL;
82 82
83 val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03; 83 val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
84 val |= mode; 84 val |= mode;
85 __raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu); 85 writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu);
86 86
87 return 0; 87 return 0;
88} 88}
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c
index 83ccca303df8..95d063620b76 100644
--- a/arch/arm/kernel/smp_tlb.c
+++ b/arch/arm/kernel/smp_tlb.c
@@ -70,6 +70,40 @@ static inline void ipi_flush_bp_all(void *ignored)
70 local_flush_bp_all(); 70 local_flush_bp_all();
71} 71}
72 72
73#ifdef CONFIG_ARM_ERRATA_798181
74bool (*erratum_a15_798181_handler)(void);
75
76static bool erratum_a15_798181_partial(void)
77{
78 asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
79 dsb(ish);
80 return false;
81}
82
83static bool erratum_a15_798181_broadcast(void)
84{
85 asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
86 dsb(ish);
87 return true;
88}
89
90void erratum_a15_798181_init(void)
91{
92 unsigned int midr = read_cpuid_id();
93 unsigned int revidr = read_cpuid(CPUID_REVIDR);
94
95 /* Cortex-A15 r0p0..r3p2 w/o ECO fix affected */
96 if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 ||
97 (revidr & 0x210) == 0x210) {
98 return;
99 }
100 if (revidr & 0x10)
101 erratum_a15_798181_handler = erratum_a15_798181_partial;
102 else
103 erratum_a15_798181_handler = erratum_a15_798181_broadcast;
104}
105#endif
106
73static void ipi_flush_tlb_a15_erratum(void *arg) 107static void ipi_flush_tlb_a15_erratum(void *arg)
74{ 108{
75 dmb(); 109 dmb();
@@ -80,7 +114,6 @@ static void broadcast_tlb_a15_erratum(void)
80 if (!erratum_a15_798181()) 114 if (!erratum_a15_798181())
81 return; 115 return;
82 116
83 dummy_flush_tlb_a15_erratum();
84 smp_call_function(ipi_flush_tlb_a15_erratum, NULL, 1); 117 smp_call_function(ipi_flush_tlb_a15_erratum, NULL, 1);
85} 118}
86 119
@@ -92,7 +125,6 @@ static void broadcast_tlb_mm_a15_erratum(struct mm_struct *mm)
92 if (!erratum_a15_798181()) 125 if (!erratum_a15_798181())
93 return; 126 return;
94 127
95 dummy_flush_tlb_a15_erratum();
96 this_cpu = get_cpu(); 128 this_cpu = get_cpu();
97 a15_erratum_get_cpumask(this_cpu, mm, &mask); 129 a15_erratum_get_cpumask(this_cpu, mm, &mask);
98 smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1); 130 smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1);
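
erratum_a15_798181() is called above but not defined in this file; presumably (per the asm/tlbflush.h changes in the diffstat) it now dispatches through the handler installed by erratum_a15_798181_init(). A hedged sketch:

	static inline bool erratum_a15_798181(void)
	{
		if (unlikely(erratum_a15_798181_handler))
			return erratum_a15_798181_handler();
		return false;
	}

With the ECO fix only partially present (REVIDR bit 4 set, bit 9 clear), the partial handler still issues the local TLBI/DSB workaround but returns false, so the costly cross-CPU broadcast above is skipped.
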
diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index 2985c9f0905d..6591e26fc13f 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode,
45 case CLOCK_EVT_MODE_PERIODIC: 45 case CLOCK_EVT_MODE_PERIODIC:
46 ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE 46 ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE
47 | TWD_TIMER_CONTROL_PERIODIC; 47 | TWD_TIMER_CONTROL_PERIODIC;
48 __raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ), 48 writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
49 twd_base + TWD_TIMER_LOAD); 49 twd_base + TWD_TIMER_LOAD);
50 break; 50 break;
51 case CLOCK_EVT_MODE_ONESHOT: 51 case CLOCK_EVT_MODE_ONESHOT:
@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode,
58 ctrl = 0; 58 ctrl = 0;
59 } 59 }
60 60
61 __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); 61 writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
62} 62}
63 63
64static int twd_set_next_event(unsigned long evt, 64static int twd_set_next_event(unsigned long evt,
65 struct clock_event_device *unused) 65 struct clock_event_device *unused)
66{ 66{
67 unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL); 67 unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL);
68 68
69 ctrl |= TWD_TIMER_CONTROL_ENABLE; 69 ctrl |= TWD_TIMER_CONTROL_ENABLE;
70 70
71 __raw_writel(evt, twd_base + TWD_TIMER_COUNTER); 71 writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER);
72 __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); 72 writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
73 73
74 return 0; 74 return 0;
75} 75}
@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt,
82 */ 82 */
83static int twd_timer_ack(void) 83static int twd_timer_ack(void)
84{ 84{
85 if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) { 85 if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) {
86 __raw_writel(1, twd_base + TWD_TIMER_INTSTAT); 86 writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT);
87 return 1; 87 return 1;
88 } 88 }
89 89
@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void)
211 waitjiffies += 5; 211 waitjiffies += 5;
212 212
213 /* enable, no interrupt or reload */ 213 /* enable, no interrupt or reload */
214 __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL); 214 writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL);
215 215
216 /* maximum value */ 216 /* maximum value */
217 __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); 217 writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
218 218
219 while (get_jiffies_64() < waitjiffies) 219 while (get_jiffies_64() < waitjiffies)
220 udelay(10); 220 udelay(10);
221 221
222 count = __raw_readl(twd_base + TWD_TIMER_COUNTER); 222 count = readl_relaxed(twd_base + TWD_TIMER_COUNTER);
223 223
224 twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5); 224 twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5);
225 225
@@ -277,7 +277,7 @@ static void twd_timer_setup(void)
277 * bother with the below. 277 * bother with the below.
278 */ 278 */
279 if (per_cpu(percpu_setup_called, cpu)) { 279 if (per_cpu(percpu_setup_called, cpu)) {
280 __raw_writel(0, twd_base + TWD_TIMER_CONTROL); 280 writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
281 clockevents_register_device(clk); 281 clockevents_register_device(clk);
282 enable_percpu_irq(clk->irq, 0); 282 enable_percpu_irq(clk->irq, 0);
283 return; 283 return;
@@ -290,7 +290,7 @@ static void twd_timer_setup(void)
290 * The following is done once per CPU the first time .setup() is 290 * The following is done once per CPU the first time .setup() is
291 * called. 291 * called.
292 */ 292 */
293 __raw_writel(0, twd_base + TWD_TIMER_CONTROL); 293 writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
294 294
295 clk->name = "local_timer"; 295 clk->name = "local_timer";
296 clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | 296 clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c
index 41cf3cbf756d..2835d35234ca 100644
--- a/arch/arm/kernel/suspend.c
+++ b/arch/arm/kernel/suspend.c
@@ -10,7 +10,7 @@
10#include <asm/suspend.h> 10#include <asm/suspend.h>
11#include <asm/tlbflush.h> 11#include <asm/tlbflush.h>
12 12
13extern int __cpu_suspend(unsigned long, int (*)(unsigned long)); 13extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid);
14extern void cpu_resume_mmu(void); 14extern void cpu_resume_mmu(void);
15 15
16#ifdef CONFIG_MMU 16#ifdef CONFIG_MMU
@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void);
21int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) 21int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
22{ 22{
23 struct mm_struct *mm = current->active_mm; 23 struct mm_struct *mm = current->active_mm;
24 u32 __mpidr = cpu_logical_map(smp_processor_id());
24 int ret; 25 int ret;
25 26
26 if (!idmap_pgd) 27 if (!idmap_pgd)
@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
32 * resume (indicated by a zero return code), we need to switch 33 * resume (indicated by a zero return code), we need to switch
33 * back to the correct page tables. 34 * back to the correct page tables.
34 */ 35 */
35 ret = __cpu_suspend(arg, fn); 36 ret = __cpu_suspend(arg, fn, __mpidr);
36 if (ret == 0) { 37 if (ret == 0) {
37 cpu_switch_mm(mm->pgd, mm); 38 cpu_switch_mm(mm->pgd, mm);
38 local_flush_bp_all(); 39 local_flush_bp_all();
@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
44#else 45#else
45int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) 46int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
46{ 47{
47 return __cpu_suspend(arg, fn); 48 u32 __mpidr = cpu_logical_map(smp_processor_id());
49 return __cpu_suspend(arg, fn, __mpidr);
48} 50}
49#define idmap_pgd NULL 51#define idmap_pgd NULL
50#endif 52#endif
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 8fcda140358d..6125f259b7b5 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -34,6 +34,7 @@
34#include <asm/unwind.h> 34#include <asm/unwind.h>
35#include <asm/tls.h> 35#include <asm/tls.h>
36#include <asm/system_misc.h> 36#include <asm/system_misc.h>
37#include <asm/opcodes.h>
37 38
38static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" }; 39static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" };
39 40
@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs,
341int is_valid_bugaddr(unsigned long pc) 342int is_valid_bugaddr(unsigned long pc)
342{ 343{
343#ifdef CONFIG_THUMB2_KERNEL 344#ifdef CONFIG_THUMB2_KERNEL
344 unsigned short bkpt; 345 u16 bkpt;
346 u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE);
345#else 347#else
346 unsigned long bkpt; 348 u32 bkpt;
349 u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE);
347#endif 350#endif
348 351
349 if (probe_kernel_address((unsigned *)pc, bkpt)) 352 if (probe_kernel_address((unsigned *)pc, bkpt))
350 return 0; 353 return 0;
351 354
352 return bkpt == BUG_INSTR_VALUE; 355 return bkpt == insn;
353} 356}
354 357
355#endif 358#endif
@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
402 if (processor_mode(regs) == SVC_MODE) { 405 if (processor_mode(regs) == SVC_MODE) {
403#ifdef CONFIG_THUMB2_KERNEL 406#ifdef CONFIG_THUMB2_KERNEL
404 if (thumb_mode(regs)) { 407 if (thumb_mode(regs)) {
405 instr = ((u16 *)pc)[0]; 408 instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]);
406 if (is_wide_instruction(instr)) { 409 if (is_wide_instruction(instr)) {
407 instr <<= 16; 410 u16 inst2;
408 instr |= ((u16 *)pc)[1]; 411 inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]);
412 instr = __opcode_thumb32_compose(instr, inst2);
409 } 413 }
410 } else 414 } else
411#endif 415#endif
412 instr = *(u32 *) pc; 416 instr = __mem_to_opcode_arm(*(u32 *) pc);
413 } else if (thumb_mode(regs)) { 417 } else if (thumb_mode(regs)) {
414 if (get_user(instr, (u16 __user *)pc)) 418 if (get_user(instr, (u16 __user *)pc))
415 goto die_sig; 419 goto die_sig;
420 instr = __mem_to_opcode_thumb16(instr);
416 if (is_wide_instruction(instr)) { 421 if (is_wide_instruction(instr)) {
417 unsigned int instr2; 422 unsigned int instr2;
418 if (get_user(instr2, (u16 __user *)pc+1)) 423 if (get_user(instr2, (u16 __user *)pc+1))
419 goto die_sig; 424 goto die_sig;
420 instr <<= 16; 425 instr2 = __mem_to_opcode_thumb16(instr2);
421 instr |= instr2; 426 instr = __opcode_thumb32_compose(instr, instr2);
422 } 427 }
423 } else if (get_user(instr, (u32 __user *)pc)) { 428 } else if (get_user(instr, (u32 __user *)pc)) {
429 instr = __mem_to_opcode_arm(instr);
424 goto die_sig; 430 goto die_sig;
425 } 431 }
426 432
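
The __mem_to_opcode_*() helpers from asm/opcodes.h abstract how instructions are stored in memory: on a BE8 kernel, data accesses are big-endian while instructions remain little-endian, so the raw loads above need a byte swap before being compared or decoded. The composition helper mirrors the "instr <<= 16; instr |= instr2" sequence it replaces; a minimal sketch:

	#include <linux/types.h>

	/* Equivalent of __opcode_thumb32_compose(): the first halfword is
	 * the most significant half of a 32-bit Thumb-2 instruction. */
	static inline u32 thumb32_compose(u16 first, u16 second)
	{
		return ((u32)first << 16) | second;
	}
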
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9c697db2787e..aea7ccb8d397 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -65,7 +65,7 @@ static bool vgic_present;
65static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) 65static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
66{ 66{
67 BUG_ON(preemptible()); 67 BUG_ON(preemptible());
68 __get_cpu_var(kvm_arm_running_vcpu) = vcpu; 68 __this_cpu_write(kvm_arm_running_vcpu, vcpu);
69} 69}
70 70
71/** 71/**
@@ -75,7 +75,7 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
75struct kvm_vcpu *kvm_arm_get_running_vcpu(void) 75struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
76{ 76{
77 BUG_ON(preemptible()); 77 BUG_ON(preemptible());
78 return __get_cpu_var(kvm_arm_running_vcpu); 78 return __this_cpu_read(kvm_arm_running_vcpu);
79} 79}
80 80
81/** 81/**
@@ -815,7 +815,7 @@ static void cpu_init_hyp_mode(void *dummy)
815 815
816 boot_pgd_ptr = kvm_mmu_get_boot_httbr(); 816 boot_pgd_ptr = kvm_mmu_get_boot_httbr();
817 pgd_ptr = kvm_mmu_get_httbr(); 817 pgd_ptr = kvm_mmu_get_httbr();
818 stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); 818 stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
819 hyp_stack_ptr = stack_page + PAGE_SIZE; 819 hyp_stack_ptr = stack_page + PAGE_SIZE;
820 vector_ptr = (unsigned long)__kvm_hyp_vector; 820 vector_ptr = (unsigned long)__kvm_hyp_vector;
821 821
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index d6408d1ee543..e0c68d5bb7dc 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -10,6 +10,11 @@ UNWIND( .fnstart )
10 and r3, r0, #31 @ Get bit offset 10 and r3, r0, #31 @ Get bit offset
11 mov r0, r0, lsr #5 11 mov r0, r0, lsr #5
12 add r1, r1, r0, lsl #2 @ Get word offset 12 add r1, r1, r0, lsl #2 @ Get word offset
13#if __LINUX_ARM_ARCH__ >= 7
14 .arch_extension mp
15 ALT_SMP(W(pldw) [r1])
16 ALT_UP(W(nop))
17#endif
13 mov r3, r2, lsl r3 18 mov r3, r2, lsl r3
141: ldrex r2, [r1] 191: ldrex r2, [r1]
15 \instr r2, r2, r3 20 \instr r2, r2, r3
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 025f742dd4df..3e58d710013c 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -18,6 +18,7 @@
18#include <linux/hardirq.h> /* for in_atomic() */ 18#include <linux/hardirq.h> /* for in_atomic() */
19#include <linux/gfp.h> 19#include <linux/gfp.h>
20#include <linux/highmem.h> 20#include <linux/highmem.h>
21#include <linux/hugetlb.h>
21#include <asm/current.h> 22#include <asm/current.h>
22#include <asm/page.h> 23#include <asm/page.h>
23 24
@@ -40,7 +41,35 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
40 return 0; 41 return 0;
41 42
42 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
43 if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd))) 44 if (unlikely(pmd_none(*pmd)))
45 return 0;
46
47 /*
48 * A pmd can be bad if it refers to a HugeTLB or THP page.
49 *
50 * Both THP and HugeTLB pages have the same pmd layout
51 * and should not be manipulated by the pte functions.
52 *
53 * Lock the page table for the destination and check
54 * to see that it's still huge and whether or not we will
55 * need to fault on write, or if we have a splitting THP.
56 */
57 if (unlikely(pmd_thp_or_huge(*pmd))) {
58 ptl = &current->mm->page_table_lock;
59 spin_lock(ptl);
60 if (unlikely(!pmd_thp_or_huge(*pmd)
61 || pmd_hugewillfault(*pmd)
62 || pmd_trans_splitting(*pmd))) {
63 spin_unlock(ptl);
64 return 0;
65 }
66
67 *ptep = NULL;
68 *ptlp = ptl;
69 return 1;
70 }
71
72 if (unlikely(pmd_bad(*pmd)))
44 return 0; 73 return 0;
45 74
46 pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); 75 pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
@@ -94,7 +123,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
94 from += tocopy; 123 from += tocopy;
95 n -= tocopy; 124 n -= tocopy;
96 125
97 pte_unmap_unlock(pte, ptl); 126 if (pte)
127 pte_unmap_unlock(pte, ptl);
128 else
129 spin_unlock(ptl);
98 } 130 }
99 if (!atomic) 131 if (!atomic)
100 up_read(&current->mm->mmap_sem); 132 up_read(&current->mm->mmap_sem);
@@ -147,7 +179,10 @@ __clear_user_memset(void __user *addr, unsigned long n)
147 addr += tocopy; 179 addr += tocopy;
148 n -= tocopy; 180 n -= tocopy;
149 181
150 pte_unmap_unlock(pte, ptl); 182 if (pte)
183 pte_unmap_unlock(pte, ptl);
184 else
185 spin_unlock(ptl);
151 } 186 }
152 up_read(&current->mm->mmap_sem); 187 up_read(&current->mm->mmap_sem);
153 188
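
The contract introduced here is that pin_page_for_write() returns either a mapped pte with its lock held or, for a huge/THP pmd, *ptep == NULL with the mm's page_table_lock held instead. Both copy loops above release it accordingly; the shared pattern could be captured in a helper like this hypothetical one:

	static inline void unpin_page_for_write(pte_t *pte, spinlock_t *ptl)
	{
		if (pte)
			pte_unmap_unlock(pte, ptl);	/* normal pte case    */
		else
			spin_unlock(ptl);		/* huge/THP pmd case  */
	}
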
diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c
index 1fd2cf097e30..eb1fa5c84723 100644
--- a/arch/arm/mach-footbridge/netwinder-hw.c
+++ b/arch/arm/mach-footbridge/netwinder-hw.c
@@ -692,14 +692,14 @@ static void netwinder_led_set(struct led_classdev *cdev,
692 unsigned long flags; 692 unsigned long flags;
693 u32 reg; 693 u32 reg;
694 694
695 spin_lock_irqsave(&nw_gpio_lock, flags); 695 raw_spin_lock_irqsave(&nw_gpio_lock, flags);
696 reg = nw_gpio_read(); 696 reg = nw_gpio_read();
697 if (b != LED_OFF) 697 if (b != LED_OFF)
698 reg &= ~led->mask; 698 reg &= ~led->mask;
699 else 699 else
700 reg |= led->mask; 700 reg |= led->mask;
701 nw_gpio_modify_op(led->mask, reg); 701 nw_gpio_modify_op(led->mask, reg);
702 spin_unlock_irqrestore(&nw_gpio_lock, flags); 702 raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
703} 703}
704 704
705static enum led_brightness netwinder_led_get(struct led_classdev *cdev) 705static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
@@ -709,9 +709,9 @@ static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
709 unsigned long flags; 709 unsigned long flags;
710 u32 reg; 710 u32 reg;
711 711
712 spin_lock_irqsave(&nw_gpio_lock, flags); 712 raw_spin_lock_irqsave(&nw_gpio_lock, flags);
713 reg = nw_gpio_read(); 713 reg = nw_gpio_read();
714 spin_unlock_irqrestore(&nw_gpio_lock, flags); 714 raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
715 715
716 return (reg & led->mask) ? LED_OFF : LED_FULL; 716 return (reg & led->mask) ? LED_OFF : LED_FULL;
717} 717}
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig
index fe98df44579c..08332d841440 100644
--- a/arch/arm/mach-highbank/Kconfig
+++ b/arch/arm/mach-highbank/Kconfig
@@ -4,11 +4,12 @@ config ARCH_HIGHBANK
4 select ARCH_HAS_CPUFREQ 4 select ARCH_HAS_CPUFREQ
5 select ARCH_HAS_HOLES_MEMORYMODEL 5 select ARCH_HAS_HOLES_MEMORYMODEL
6 select ARCH_HAS_OPP 6 select ARCH_HAS_OPP
7 select ARCH_SUPPORTS_BIG_ENDIAN
7 select ARCH_WANT_OPTIONAL_GPIOLIB 8 select ARCH_WANT_OPTIONAL_GPIOLIB
8 select ARM_AMBA 9 select ARM_AMBA
9 select ARM_ERRATA_764369 10 select ARM_ERRATA_764369
10 select ARM_ERRATA_775420 11 select ARM_ERRATA_775420
11 select ARM_ERRATA_798181 12 select ARM_ERRATA_798181 if SMP
12 select ARM_GIC 13 select ARM_GIC
13 select ARM_PSCI 14 select ARM_PSCI
14 select ARM_TIMER_SP804 15 select ARM_TIMER_SP804
diff --git a/arch/arm/mach-ixp4xx/Kconfig b/arch/arm/mach-ixp4xx/Kconfig
index 30e1ebe3a891..c342dc4e8a45 100644
--- a/arch/arm/mach-ixp4xx/Kconfig
+++ b/arch/arm/mach-ixp4xx/Kconfig
@@ -1,9 +1,5 @@
1if ARCH_IXP4XX 1if ARCH_IXP4XX
2 2
3config ARCH_SUPPORTS_BIG_ENDIAN
4 bool
5 default y
6
7menu "Intel IXP4xx Implementation Options" 3menu "Intel IXP4xx Implementation Options"
8 4
9comment "IXP4xx Platforms" 5comment "IXP4xx Platforms"
diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig
index 9eb63d724602..5e269d7263ce 100644
--- a/arch/arm/mach-mvebu/Kconfig
+++ b/arch/arm/mach-mvebu/Kconfig
@@ -1,5 +1,6 @@
1config ARCH_MVEBU 1config ARCH_MVEBU
2 bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7 2 bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7
3 select ARCH_SUPPORTS_BIG_ENDIAN
3 select CLKSRC_MMIO 4 select CLKSRC_MMIO
4 select COMMON_CLK 5 select COMMON_CLK
5 select GENERIC_CLOCKEVENTS 6 select GENERIC_CLOCKEVENTS
diff --git a/arch/arm/mach-mvebu/coherency_ll.S b/arch/arm/mach-mvebu/coherency_ll.S
index 5476669ba905..ee7598fe75db 100644
--- a/arch/arm/mach-mvebu/coherency_ll.S
+++ b/arch/arm/mach-mvebu/coherency_ll.S
@@ -20,6 +20,8 @@
20#define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0 20#define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0
21#define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4 21#define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4
22 22
23#include <asm/assembler.h>
24
23 .text 25 .text
24/* 26/*
25 * r0: Coherency fabric base register address 27 * r0: Coherency fabric base register address
@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent)
29 /* Create bit by cpu index */ 31 /* Create bit by cpu index */
30 mov r3, #(1 << 24) 32 mov r3, #(1 << 24)
31 lsl r1, r3, r1 33 lsl r1, r3, r1
34ARM_BE8(rev r1, r1)
32 35
33 /* Add CPU to SMP group - Atomic */ 36 /* Add CPU to SMP group - Atomic */
34 add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET 37 add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET
diff --git a/arch/arm/mach-mvebu/headsmp.S b/arch/arm/mach-mvebu/headsmp.S
index 8a1b0c96e9ec..3dd80df428f7 100644
--- a/arch/arm/mach-mvebu/headsmp.S
+++ b/arch/arm/mach-mvebu/headsmp.S
@@ -21,12 +21,16 @@
21#include <linux/linkage.h> 21#include <linux/linkage.h>
22#include <linux/init.h> 22#include <linux/init.h>
23 23
24#include <asm/assembler.h>
25
24/* 26/*
25 * Armada XP specific entry point for secondary CPUs. 27 * Armada XP specific entry point for secondary CPUs.
26 * We add the CPU to the coherency fabric and then jump to secondary 28 * We add the CPU to the coherency fabric and then jump to secondary
27 * startup 29 * startup
28 */ 30 */
29ENTRY(armada_xp_secondary_startup) 31ENTRY(armada_xp_secondary_startup)
32 ARM_BE8(setend be ) @ go BE8 if entered LE
33
30 /* Get coherency fabric base physical address */ 34 /* Get coherency fabric base physical address */
31 adr r0, 1f 35 adr r0, 1f
32 ldr r1, [r0] 36 ldr r1, [r0]
diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index e838ba27e443..c9808c684152 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -512,6 +512,9 @@ static void __init assabet_map_io(void)
 512 * It's called GPCLKR0 in my SA1110 manual. 512 * It's called GPCLKR0 in my SA1110 manual.
513 */ 513 */
514 Ser1SDCR0 |= SDCR0_SUS; 514 Ser1SDCR0 |= SDCR0_SUS;
515 MSC1 = (MSC1 & ~0xffff) |
516 MSC_NonBrst | MSC_32BitStMem |
517 MSC_RdAcc(2) | MSC_WrAcc(2) | MSC_Rec(0);
515 518
516 if (!machine_has_neponset()) 519 if (!machine_has_neponset())
517 sa1100_register_uart_fns(&assabet_port_fns); 520 sa1100_register_uart_fns(&assabet_port_fns);
diff --git a/arch/arm/mach-sa1100/include/mach/gpio.h b/arch/arm/mach-sa1100/include/mach/gpio.h
deleted file mode 100644
index 6a9eecf3137e..000000000000
--- a/arch/arm/mach-sa1100/include/mach/gpio.h
+++ /dev/null
@@ -1,55 +0,0 @@
1/*
2 * arch/arm/mach-sa1100/include/mach/gpio.h
3 *
4 * SA1100 GPIO wrappers for arch-neutral GPIO calls
5 *
6 * Written by Philipp Zabel <philipp.zabel@gmail.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 */
23
24#ifndef __ASM_ARCH_SA1100_GPIO_H
25#define __ASM_ARCH_SA1100_GPIO_H
26
27#include <linux/io.h>
28#include <mach/hardware.h>
29#include <asm/irq.h>
30#include <asm-generic/gpio.h>
31
32#define __ARM_GPIOLIB_COMPLEX
33
34static inline int gpio_get_value(unsigned gpio)
35{
36 if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
37 return GPLR & GPIO_GPIO(gpio);
38 else
39 return __gpio_get_value(gpio);
40}
41
42static inline void gpio_set_value(unsigned gpio, int value)
43{
44 if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
45 if (value)
46 GPSR = GPIO_GPIO(gpio);
47 else
48 GPCR = GPIO_GPIO(gpio);
49 else
50 __gpio_set_value(gpio, value);
51}
52
53#define gpio_cansleep __gpio_cansleep
54
55#endif
diff --git a/arch/arm/mach-sa1100/include/mach/h3xxx.h b/arch/arm/mach-sa1100/include/mach/h3xxx.h
index 7d9df16f04a2..c810620db53d 100644
--- a/arch/arm/mach-sa1100/include/mach/h3xxx.h
+++ b/arch/arm/mach-sa1100/include/mach/h3xxx.h
@@ -13,6 +13,8 @@
13#ifndef _INCLUDE_H3XXX_H_ 13#ifndef _INCLUDE_H3XXX_H_
14#define _INCLUDE_H3XXX_H_ 14#define _INCLUDE_H3XXX_H_
15 15
16#include "hardware.h" /* Gives GPIO_MAX */
17
16/* Physical memory regions corresponding to chip selects */ 18/* Physical memory regions corresponding to chip selects */
17#define H3600_EGPIO_PHYS (SA1100_CS5_PHYS + 0x01000000) 19#define H3600_EGPIO_PHYS (SA1100_CS5_PHYS + 0x01000000)
18#define H3600_BANK_2_PHYS SA1100_CS2_PHYS 20#define H3600_BANK_2_PHYS SA1100_CS2_PHYS
diff --git a/arch/arm/mach-sa1100/simpad.c b/arch/arm/mach-sa1100/simpad.c
index bcbc94540e45..41e476e571d7 100644
--- a/arch/arm/mach-sa1100/simpad.c
+++ b/arch/arm/mach-sa1100/simpad.c
@@ -19,6 +19,7 @@
19 19
20#include <mach/hardware.h> 20#include <mach/hardware.h>
21#include <asm/setup.h> 21#include <asm/setup.h>
22#include <asm/irq.h>
22 23
23#include <asm/mach-types.h> 24#include <asm/mach-types.h>
24#include <asm/mach/arch.h> 25#include <asm/mach/arch.h>
diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig
index 0bf04a0bca9d..09e740f58b27 100644
--- a/arch/arm/mach-tegra/Kconfig
+++ b/arch/arm/mach-tegra/Kconfig
@@ -51,7 +51,7 @@ config ARCH_TEGRA_3x_SOC
51 51
52config ARCH_TEGRA_114_SOC 52config ARCH_TEGRA_114_SOC
53 bool "Enable support for Tegra114 family" 53 bool "Enable support for Tegra114 family"
54 select ARM_ERRATA_798181 54 select ARM_ERRATA_798181 if SMP
55 select ARM_L1_CACHE_SHIFT_6 55 select ARM_L1_CACHE_SHIFT_6
56 select HAVE_ARM_ARCH_TIMER 56 select HAVE_ARM_ARCH_TIMER
57 select PINCTRL_TEGRA114 57 select PINCTRL_TEGRA114
diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig
index d7e7422527ca..cbbb81e0e509 100644
--- a/arch/arm/mach-vexpress/Kconfig
+++ b/arch/arm/mach-vexpress/Kconfig
@@ -1,6 +1,7 @@
1config ARCH_VEXPRESS 1config ARCH_VEXPRESS
2 bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7 2 bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7
3 select ARCH_REQUIRE_GPIOLIB 3 select ARCH_REQUIRE_GPIOLIB
4 select ARCH_SUPPORTS_BIG_ENDIAN
4 select ARM_AMBA 5 select ARM_AMBA
5 select ARM_GIC 6 select ARM_GIC
6 select ARM_TIMER_SP804 7 select ARM_TIMER_SP804
diff --git a/arch/arm/mach-vexpress/dcscb.c b/arch/arm/mach-vexpress/dcscb.c
index 3a6384c6c435..14d499688736 100644
--- a/arch/arm/mach-vexpress/dcscb.c
+++ b/arch/arm/mach-vexpress/dcscb.c
@@ -133,38 +133,8 @@ static void dcscb_power_down(void)
133 if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) { 133 if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
134 arch_spin_unlock(&dcscb_lock); 134 arch_spin_unlock(&dcscb_lock);
135 135
136 /* 136 /* Flush all cache levels for this cluster. */
137 * Flush all cache levels for this cluster. 137 v7_exit_coherency_flush(all);
138 *
139 * To do so we do:
140 * - Clear the SCTLR.C bit to prevent further cache allocations
141 * - Flush the whole cache
142 * - Clear the ACTLR "SMP" bit to disable local coherency
143 *
144 * Let's do it in the safest possible way i.e. with
145 * no memory access within the following sequence
146 * including to the stack.
147 *
148 * Note: fp is preserved to the stack explicitly prior doing
149 * this since adding it to the clobber list is incompatible
150 * with having CONFIG_FRAME_POINTER=y.
151 */
152 asm volatile(
153 "str fp, [sp, #-4]! \n\t"
154 "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t"
155 "bic r0, r0, #"__stringify(CR_C)" \n\t"
156 "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t"
157 "isb \n\t"
158 "bl v7_flush_dcache_all \n\t"
159 "clrex \n\t"
160 "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t"
161 "bic r0, r0, #(1 << 6) @ disable local coherency \n\t"
162 "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t"
163 "isb \n\t"
164 "dsb \n\t"
165 "ldr fp, [sp], #4"
166 : : : "r0","r1","r2","r3","r4","r5","r6","r7",
167 "r9","r10","lr","memory");
168 138
169 /* 139 /*
170 * This is a harmless no-op. On platforms with a real 140 * This is a harmless no-op. On platforms with a real
@@ -183,26 +153,8 @@ static void dcscb_power_down(void)
183 } else { 153 } else {
184 arch_spin_unlock(&dcscb_lock); 154 arch_spin_unlock(&dcscb_lock);
185 155
186 /* 156 /* Disable and flush the local CPU cache. */
187 * Flush the local CPU cache. 157 v7_exit_coherency_flush(louis);
188 * Let's do it in the safest possible way as above.
189 */
190 asm volatile(
191 "str fp, [sp, #-4]! \n\t"
192 "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t"
193 "bic r0, r0, #"__stringify(CR_C)" \n\t"
194 "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t"
195 "isb \n\t"
196 "bl v7_flush_dcache_louis \n\t"
197 "clrex \n\t"
198 "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t"
199 "bic r0, r0, #(1 << 6) @ disable local coherency \n\t"
200 "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t"
201 "isb \n\t"
202 "dsb \n\t"
203 "ldr fp, [sp], #4"
204 : : : "r0","r1","r2","r3","r4","r5","r6","r7",
205 "r9","r10","lr","memory");
206 } 158 }
207 159
208 __mcpm_cpu_down(cpu, cluster); 160 __mcpm_cpu_down(cpu, cluster);
diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c
index e6eb48192912..4eb92ebfd953 100644
--- a/arch/arm/mach-vexpress/tc2_pm.c
+++ b/arch/arm/mach-vexpress/tc2_pm.c
@@ -156,32 +156,7 @@ static void tc2_pm_down(u64 residency)
156 : : "r" (0x400) ); 156 : : "r" (0x400) );
157 } 157 }
158 158
159 /* 159 v7_exit_coherency_flush(all);
160 * We need to disable and flush the whole (L1 and L2) cache.
161 * Let's do it in the safest possible way i.e. with
162 * no memory access within the following sequence
163 * including the stack.
164 *
165 * Note: fp is preserved to the stack explicitly prior doing
166 * this since adding it to the clobber list is incompatible
167 * with having CONFIG_FRAME_POINTER=y.
168 */
169 asm volatile(
170 "str fp, [sp, #-4]! \n\t"
171 "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t"
172 "bic r0, r0, #"__stringify(CR_C)" \n\t"
173 "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t"
174 "isb \n\t"
175 "bl v7_flush_dcache_all \n\t"
176 "clrex \n\t"
177 "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t"
178 "bic r0, r0, #(1 << 6) @ disable local coherency \n\t"
179 "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t"
180 "isb \n\t"
181 "dsb \n\t"
182 "ldr fp, [sp], #4"
183 : : : "r0","r1","r2","r3","r4","r5","r6","r7",
184 "r9","r10","lr","memory");
185 160
186 cci_disable_port_by_cpu(mpidr); 161 cci_disable_port_by_cpu(mpidr);
187 162
@@ -197,26 +172,7 @@ static void tc2_pm_down(u64 residency)
197 172
198 arch_spin_unlock(&tc2_pm_lock); 173 arch_spin_unlock(&tc2_pm_lock);
199 174
200 /* 175 v7_exit_coherency_flush(louis);
201 * We need to disable and flush only the L1 cache.
202 * Let's do it in the safest possible way as above.
203 */
204 asm volatile(
205 "str fp, [sp, #-4]! \n\t"
206 "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t"
207 "bic r0, r0, #"__stringify(CR_C)" \n\t"
208 "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t"
209 "isb \n\t"
210 "bl v7_flush_dcache_louis \n\t"
211 "clrex \n\t"
212 "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t"
213 "bic r0, r0, #(1 << 6) @ disable local coherency \n\t"
214 "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t"
215 "isb \n\t"
216 "dsb \n\t"
217 "ldr fp, [sp], #4"
218 : : : "r0","r1","r2","r3","r4","r5","r6","r7",
219 "r9","r10","lr","memory");
220 } 176 }
221 177
222 __mcpm_cpu_down(cpu, cluster); 178 __mcpm_cpu_down(cpu, cluster);
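
Both call sites now rely on v7_exit_coherency_flush(), which is not shown in this diff (asm/cacheflush.h grows by 46 lines in the diffstat); presumably it wraps the same "disable SCTLR.C, flush, clear ACTLR.SMP" sequence that was open-coded here, parameterised on the flush level. A hedged sketch reconstructed from the removed asm:

	#define v7_exit_coherency_flush(level) \
		asm volatile( \
		"str	fp, [sp, #-4]!			\n\t" \
		"mrc	p15, 0, r0, c1, c0, 0	@ get SCTLR \n\t" \
		"bic	r0, r0, #"__stringify(CR_C)"	\n\t" \
		"mcr	p15, 0, r0, c1, c0, 0	@ disable D-cache \n\t" \
		"isb					\n\t" \
		"bl	v7_flush_dcache_"#level"	\n\t" \
		"clrex					\n\t" \
		"mrc	p15, 0, r0, c1, c0, 1	@ get ACTLR \n\t" \
		"bic	r0, r0, #(1 << 6)	@ disable local coherency \n\t" \
		"mcr	p15, 0, r0, c1, c0, 1	@ set ACTLR \n\t" \
		"isb					\n\t" \
		"dsb					\n\t" \
		"ldr	fp, [sp], #4" \
		: : : "r0","r1","r2","r3","r4","r5","r6","r7", \
		      "r9","r10","lr","memory")
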
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index cd2c88e7a8f7..1f8fed94c2a4 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS
952 help 952 help
953 This option allows the use of custom mandatory barriers 953 This option allows the use of custom mandatory barriers
954 included via the mach/barriers.h file. 954 included via the mach/barriers.h file.
955
956config ARCH_SUPPORTS_BIG_ENDIAN
957 bool
958 help
 959 This option specifies that the architecture can support big-endian
960 operation.
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 80741992a9fc..3815a8262af0 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -38,9 +38,8 @@ ENTRY(v6_early_abort)
38 bne do_DataAbort 38 bne do_DataAbort
39 bic r1, r1, #1 << 11 @ clear bit 11 of FSR 39 bic r1, r1, #1 << 11 @ clear bit 11 of FSR
40 ldr r3, [r4] @ read aborted ARM instruction 40 ldr r3, [r4] @ read aborted ARM instruction
41#ifdef CONFIG_CPU_ENDIAN_BE8 41 ARM_BE8(rev r3, r3)
42 rev r3, r3 42
43#endif
44 do_ldrd_abort tmp=ip, insn=r3 43 do_ldrd_abort tmp=ip, insn=r3
45 tst r3, #1 << 20 @ L = 0 -> write 44 tst r3, #1 << 20 @ L = 0 -> write
46 orreq r1, r1, #1 << 11 @ yes. 45 orreq r1, r1, #1 << 11 @ yes.
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 6f4585b89078..924036473b16 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -25,6 +25,7 @@
25#include <asm/cp15.h> 25#include <asm/cp15.h>
26#include <asm/system_info.h> 26#include <asm/system_info.h>
27#include <asm/unaligned.h> 27#include <asm/unaligned.h>
28#include <asm/opcodes.h>
28 29
29#include "fault.h" 30#include "fault.h"
30 31
@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
762 if (thumb_mode(regs)) { 763 if (thumb_mode(regs)) {
763 u16 *ptr = (u16 *)(instrptr & ~1); 764 u16 *ptr = (u16 *)(instrptr & ~1);
764 fault = probe_kernel_address(ptr, tinstr); 765 fault = probe_kernel_address(ptr, tinstr);
766 tinstr = __mem_to_opcode_thumb16(tinstr);
765 if (!fault) { 767 if (!fault) {
766 if (cpu_architecture() >= CPU_ARCH_ARMv7 && 768 if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
767 IS_T32(tinstr)) { 769 IS_T32(tinstr)) {
768 /* Thumb-2 32-bit */ 770 /* Thumb-2 32-bit */
769 u16 tinst2 = 0; 771 u16 tinst2 = 0;
770 fault = probe_kernel_address(ptr + 1, tinst2); 772 fault = probe_kernel_address(ptr + 1, tinst2);
771 instr = (tinstr << 16) | tinst2; 773 tinst2 = __mem_to_opcode_thumb16(tinst2);
774 instr = __opcode_thumb32_compose(tinstr, tinst2);
772 thumb2_32b = 1; 775 thumb2_32b = 1;
773 } else { 776 } else {
774 isize = 2; 777 isize = 2;
775 instr = thumb2arm(tinstr); 778 instr = thumb2arm(tinstr);
776 } 779 }
777 } 780 }
778 } else 781 } else {
779 fault = probe_kernel_address(instrptr, instr); 782 fault = probe_kernel_address(instrptr, instr);
783 instr = __mem_to_opcode_arm(instr);
784 }
780 785
781 if (fault) { 786 if (fault) {
782 type = TYPE_FAULT; 787 type = TYPE_FAULT;
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 644d91f73b00..79f8b39801a8 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -707,7 +707,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
707void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, 707void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
708 gfp_t gfp, struct dma_attrs *attrs) 708 gfp_t gfp, struct dma_attrs *attrs)
709{ 709{
710 pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel); 710 pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
711 void *memory; 711 void *memory;
712 712
713 if (dma_alloc_from_coherent(dev, size, handle, &memory)) 713 if (dma_alloc_from_coherent(dev, size, handle, &memory))
@@ -720,7 +720,7 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
720static void *arm_coherent_dma_alloc(struct device *dev, size_t size, 720static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
721 dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs) 721 dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
722{ 722{
723 pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel); 723 pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
724 void *memory; 724 void *memory;
725 725
726 if (dma_alloc_from_coherent(dev, size, handle, &memory)) 726 if (dma_alloc_from_coherent(dev, size, handle, &memory))
diff --git a/arch/arm/mm/extable.c b/arch/arm/mm/extable.c
index 9d285626bc7d..312e15e6d00b 100644
--- a/arch/arm/mm/extable.c
+++ b/arch/arm/mm/extable.c
@@ -9,8 +9,13 @@ int fixup_exception(struct pt_regs *regs)
9 const struct exception_table_entry *fixup; 9 const struct exception_table_entry *fixup;
10 10
11 fixup = search_exception_tables(instruction_pointer(regs)); 11 fixup = search_exception_tables(instruction_pointer(regs));
12 if (fixup) 12 if (fixup) {
13 regs->ARM_pc = fixup->fixup; 13 regs->ARM_pc = fixup->fixup;
14#ifdef CONFIG_THUMB2_KERNEL
15 /* Clear the IT state to avoid nasty surprises in the fixup */
16 regs->ARM_cpsr &= ~PSR_IT_MASK;
17#endif
18 }
14 19
15 return fixup != NULL; 20 return fixup != NULL;
16} 21}
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index 83cb3ac27095..8e0e52eb76b5 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -10,6 +10,7 @@
10#include <asm/system_info.h> 10#include <asm/system_info.h>
11 11
12pgd_t *idmap_pgd; 12pgd_t *idmap_pgd;
13phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
13 14
14#ifdef CONFIG_ARM_LPAE 15#ifdef CONFIG_ARM_LPAE
15static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, 16static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
67 unsigned long addr, end; 68 unsigned long addr, end;
68 unsigned long next; 69 unsigned long next;
69 70
70 addr = virt_to_phys(text_start); 71 addr = virt_to_idmap(text_start);
71 end = virt_to_phys(text_end); 72 end = virt_to_idmap(text_end);
73 pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
72 74
73 prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; 75 prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
74 76
@@ -90,8 +92,6 @@ static int __init init_static_idmap(void)
90 if (!idmap_pgd) 92 if (!idmap_pgd)
91 return -ENOMEM; 93 return -ENOMEM;
92 94
93 pr_info("Setting up static identity map for 0x%p - 0x%p\n",
94 __idmap_text_start, __idmap_text_end);
95 identity_mapping_add(idmap_pgd, __idmap_text_start, 95 identity_mapping_add(idmap_pgd, __idmap_text_start,
96 __idmap_text_end, 0); 96 __idmap_text_end, 0);
97 97
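
virt_to_idmap() is used above but defined elsewhere (asm/memory.h also changes in this series); presumably it falls back to virt_to_phys() unless a platform installs the arch_virt_to_idmap hook declared here. A hedged sketch:

	extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x);

	static inline phys_addr_t virt_to_idmap_sketch(unsigned long x)
	{
		if (arch_virt_to_idmap)
			return arch_virt_to_idmap(x);	/* platform idmap alias  */
		return virt_to_phys((void *)x);		/* default: 1:1 over RAM */
	}
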
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 0c6356255fe3..d27158c38eb0 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -202,13 +202,11 @@ int valid_phys_addr_range(phys_addr_t addr, size_t size)
202} 202}
203 203
204/* 204/*
205 * We don't use supersection mappings for mmap() on /dev/mem, which 205 * Do not allow /dev/mem mappings beyond the supported physical range.
206 * means that we can't map the memory area above the 4G barrier into
207 * userspace.
208 */ 206 */
209int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) 207int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
210{ 208{
211 return !(pfn + (size >> PAGE_SHIFT) > 0x00100000); 209 return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT));
212} 210}
213 211
214#ifdef CONFIG_STRICT_DEVMEM 212#ifdef CONFIG_STRICT_DEVMEM
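
The new bound allows /dev/mem mappings anywhere within the architecture's physical address mask instead of the old hard-coded 4 GiB limit. A small worked example, assuming a 40-bit LPAE physical mask and 4 KiB pages (both values are assumptions for illustration):

	#include <stdbool.h>
	#include <stddef.h>

	#define EX_PAGE_SHIFT	12
	#define EX_PHYS_MASK	((1ULL << 40) - 1)	/* assumed 40-bit PA space */

	/* Mirrors the new check: the highest valid pfn is PHYS_MASK >> PAGE_SHIFT,
	 * so pfn + npages may reach one past it. */
	static bool ex_valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
	{
		return (pfn + (size >> EX_PAGE_SHIFT)) <=
		       (1 + (EX_PHYS_MASK >> EX_PAGE_SHIFT));
	}

	/* With these values the old check rejected anything past pfn 0x100000
	 * (4 GiB); the new one accepts up to pfn 0x0fffffff (1 TiB). */
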
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index b1d17eeb59b8..78eeeca78f5a 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -28,6 +28,8 @@
28#include <asm/highmem.h> 28#include <asm/highmem.h>
29#include <asm/system_info.h> 29#include <asm/system_info.h>
30#include <asm/traps.h> 30#include <asm/traps.h>
31#include <asm/procinfo.h>
32#include <asm/memory.h>
31 33
32#include <asm/mach/arch.h> 34#include <asm/mach/arch.h>
33#include <asm/mach/map.h> 35#include <asm/mach/map.h>
@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void)
1315 } 1317 }
1316} 1318}
1317 1319
1320#ifdef CONFIG_ARM_LPAE
1321/*
1322 * early_paging_init() recreates boot time page table setup, allowing machines
1323 * to switch over to a high (>4G) address space on LPAE systems
1324 */
1325void __init early_paging_init(const struct machine_desc *mdesc,
1326 struct proc_info_list *procinfo)
1327{
1328 pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
1329 unsigned long map_start, map_end;
1330 pgd_t *pgd0, *pgdk;
1331 pud_t *pud0, *pudk, *pud_start;
1332 pmd_t *pmd0, *pmdk;
1333 phys_addr_t phys;
1334 int i;
1335
1336 if (!(mdesc->init_meminfo))
1337 return;
1338
1339 /* remap kernel code and data */
1340 map_start = init_mm.start_code;
1341 map_end = init_mm.brk;
1342
1343 /* get a handle on things... */
1344 pgd0 = pgd_offset_k(0);
1345 pud_start = pud0 = pud_offset(pgd0, 0);
1346 pmd0 = pmd_offset(pud0, 0);
1347
1348 pgdk = pgd_offset_k(map_start);
1349 pudk = pud_offset(pgdk, map_start);
1350 pmdk = pmd_offset(pudk, map_start);
1351
1352 mdesc->init_meminfo();
1353
1354 /* Run the patch stub to update the constants */
1355 fixup_pv_table(&__pv_table_begin,
1356 (&__pv_table_end - &__pv_table_begin) << 2);
1357
1358 /*
1359 * Cache cleaning operations for self-modifying code.
1360 * We should clean the entries by MVA but running a
1361 * for loop over every pv_table entry pointer would
1362 * just complicate the code.
1363 */
1364 flush_cache_louis();
1365 dsb();
1366 isb();
1367
1368 /* remap level 1 table */
1369 for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
1370 set_pud(pud0,
1371 __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
1372 pmd0 += PTRS_PER_PMD;
1373 }
1374
1375 /* remap pmds for kernel mapping */
1376 phys = __pa(map_start) & PMD_MASK;
1377 do {
1378 *pmdk++ = __pmd(phys | pmdprot);
1379 phys += PMD_SIZE;
1380 } while (phys < map_end);
1381
1382 flush_cache_all();
1383 cpu_switch_mm(pgd0, &init_mm);
1384 cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
1385 local_flush_bp_all();
1386 local_flush_tlb_all();
1387}
1388
1389#else
1390
1391void __init early_paging_init(const struct machine_desc *mdesc,
1392 struct proc_info_list *procinfo)
1393{
1394 if (mdesc->init_meminfo)
1395 mdesc->init_meminfo();
1396}
1397
1398#endif
1399
1318/* 1400/*
1319 * paging_init() sets up the page tables, initialises the zone memory 1401 * paging_init() sets up the page tables, initialises the zone memory
1320 * maps, and sets up the zero page, bad page and bad page tables. 1402 * maps, and sets up the zero page, bad page and bad page tables.
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index 34d4ab217bab..5c668b7a31f9 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -296,6 +296,15 @@ void __init sanity_check_meminfo(void)
296} 296}
297 297
298/* 298/*
299 * early_paging_init() recreates boot time page table setup, allowing machines
300 * to switch over to a high (>4G) address space on LPAE systems
301 */
302void __init early_paging_init(const struct machine_desc *mdesc,
303 struct proc_info_list *procinfo)
304{
305}
306
307/*
299 * paging_init() sets up the page tables, initialises the zone memory 308 * paging_init() sets up the page tables, initialises the zone memory
300 * maps, and sets up the zero page, bad page and bad page tables. 309 * maps, and sets up the zero page, bad page and bad page tables.
301 */ 310 */
diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S
index 1128064fddcb..45dc29f85d56 100644
--- a/arch/arm/mm/proc-v6.S
+++ b/arch/arm/mm/proc-v6.S
@@ -220,9 +220,7 @@ __v6_setup:
220#endif /* CONFIG_MMU */ 220#endif /* CONFIG_MMU */
221 adr r5, v6_crval 221 adr r5, v6_crval
222 ldmia r5, {r5, r6} 222 ldmia r5, {r5, r6}
223#ifdef CONFIG_CPU_ENDIAN_BE8 223 ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables
224 orr r6, r6, #1 << 25 @ big-endian page tables
225#endif
226 mrc p15, 0, r0, c1, c0, 0 @ read control register 224 mrc p15, 0, r0, c1, c0, 0 @ read control register
227 bic r0, r0, r5 @ clear bits them 225 bic r0, r0, r5 @ clear bits them
228 orr r0, r0, r6 @ set them 226 orr r0, r0, r6 @ set them
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index c63d9bdee51e..60920f62fdf5 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -367,9 +367,7 @@ __v7_setup:
367#endif 367#endif
368 adr r5, v7_crval 368 adr r5, v7_crval
369 ldmia r5, {r5, r6} 369 ldmia r5, {r5, r6}
370#ifdef CONFIG_CPU_ENDIAN_BE8 370 ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables
371 orr r6, r6, #1 << 25 @ big-endian page tables
372#endif
373#ifdef CONFIG_SWP_EMULATE 371#ifdef CONFIG_SWP_EMULATE
374 orr r5, r5, #(1 << 10) @ set SW bit in "clear" 372 orr r5, r5, #(1 << 10) @ set SW bit in "clear"
375 bic r6, r6, #(1 << 10) @ clear it in "mmuset" 373 bic r6, r6, #(1 << 10) @ clear it in "mmuset"
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 99b44e0e8d86..9ed155ad0f97 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -19,6 +19,7 @@
19#include <linux/if_vlan.h> 19#include <linux/if_vlan.h>
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/hwcap.h> 21#include <asm/hwcap.h>
22#include <asm/opcodes.h>
22 23
23#include "bpf_jit_32.h" 24#include "bpf_jit_32.h"
24 25
@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor)
113 114
114static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) 115static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
115{ 116{
117 inst |= (cond << 28);
118 inst = __opcode_to_mem_arm(inst);
119
116 if (ctx->target != NULL) 120 if (ctx->target != NULL)
117 ctx->target[ctx->idx] = inst | (cond << 28); 121 ctx->target[ctx->idx] = inst;
118 122
119 ctx->idx++; 123 ctx->idx++;
120} 124}
diff --git a/arch/arm/plat-versatile/headsmp.S b/arch/arm/plat-versatile/headsmp.S
index 2677bc3762d7..40f27e52de75 100644
--- a/arch/arm/plat-versatile/headsmp.S
+++ b/arch/arm/plat-versatile/headsmp.S
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/linkage.h> 11#include <linux/linkage.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <asm/assembler.h>
13 14
14/* 15/*
15 * Realview/Versatile Express specific entry point for secondary CPUs. 16 * Realview/Versatile Express specific entry point for secondary CPUs.
@@ -17,6 +18,7 @@
17 * until we're ready for them to initialise. 18 * until we're ready for them to initialise.
18 */ 19 */
19ENTRY(versatile_secondary_startup) 20ENTRY(versatile_secondary_startup)
21 ARM_BE8(setend be)
20 mrc p15, 0, r0, c0, c0, 5 22 mrc p15, 0, r0, c0, c0, 5
21 bic r0, #0xff000000 23 bic r0, #0xff000000
22 adr r4, 1f 24 adr r4, 1f
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 52b8f40b1c73..2f37e1d6cb45 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -642,9 +642,9 @@ int vfp_restore_user_hwstate(struct user_vfp __user *ufp,
642static int vfp_hotplug(struct notifier_block *b, unsigned long action, 642static int vfp_hotplug(struct notifier_block *b, unsigned long action,
643 void *hcpu) 643 void *hcpu)
644{ 644{
645 if (action == CPU_DYING || action == CPU_DYING_FROZEN) { 645 if (action == CPU_DYING || action == CPU_DYING_FROZEN)
646 vfp_force_reload((long)hcpu, current_thread_info()); 646 vfp_current_hw_state[(long)hcpu] = NULL;
647 } else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN) 647 else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
648 vfp_enable(NULL); 648 vfp_enable(NULL);
649 return NOTIFY_OK; 649 return NOTIFY_OK;
650} 650}
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 836364468571..01de5aaa3edc 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -126,20 +126,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
126 return oldval; 126 return oldval;
127} 127}
128 128
129static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
130{
131 unsigned long tmp, tmp2;
132
133 asm volatile("// atomic_clear_mask\n"
134"1: ldxr %0, %2\n"
135" bic %0, %0, %3\n"
136" stxr %w1, %0, %2\n"
137" cbnz %w1, 1b"
138 : "=&r" (tmp), "=&r" (tmp2), "+Q" (*addr)
139 : "Ir" (mask)
140 : "cc");
141}
142
143#define atomic_xchg(v, new) (xchg(&((v)->counter), new)) 129#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
144 130
145static inline int __atomic_add_unless(atomic_t *v, int a, int u) 131static inline int __atomic_add_unless(atomic_t *v, int a, int u)
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index cbfacf7fb438..6a0a9b132d7a 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -27,7 +27,6 @@
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28 28
29#include <asm/debug-monitors.h> 29#include <asm/debug-monitors.h>
30#include <asm/local.h>
31#include <asm/cputype.h> 30#include <asm/cputype.h>
32#include <asm/system_misc.h> 31#include <asm/system_misc.h>
33 32
@@ -89,8 +88,8 @@ early_param("nodebugmon", early_debug_disable);
89 * Keep track of debug users on each core. 88 * Keep track of debug users on each core.
90 * The ref counts are per-cpu so we use a local_t type. 89 * The ref counts are per-cpu so we use a local_t type.
91 */ 90 */
92static DEFINE_PER_CPU(local_t, mde_ref_count); 91static DEFINE_PER_CPU(int, mde_ref_count);
93static DEFINE_PER_CPU(local_t, kde_ref_count); 92static DEFINE_PER_CPU(int, kde_ref_count);
94 93
95void enable_debug_monitors(enum debug_el el) 94void enable_debug_monitors(enum debug_el el)
96{ 95{
@@ -98,11 +97,11 @@ void enable_debug_monitors(enum debug_el el)
98 97
99 WARN_ON(preemptible()); 98 WARN_ON(preemptible());
100 99
101 if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1) 100 if (this_cpu_inc_return(mde_ref_count) == 1)
102 enable = DBG_MDSCR_MDE; 101 enable = DBG_MDSCR_MDE;
103 102
104 if (el == DBG_ACTIVE_EL1 && 103 if (el == DBG_ACTIVE_EL1 &&
105 local_inc_return(&__get_cpu_var(kde_ref_count)) == 1) 104 this_cpu_inc_return(kde_ref_count) == 1)
106 enable |= DBG_MDSCR_KDE; 105 enable |= DBG_MDSCR_KDE;
107 106
108 if (enable && debug_enabled) { 107 if (enable && debug_enabled) {
@@ -118,11 +117,11 @@ void disable_debug_monitors(enum debug_el el)
118 117
119 WARN_ON(preemptible()); 118 WARN_ON(preemptible());
120 119
121 if (local_dec_and_test(&__get_cpu_var(mde_ref_count))) 120 if (this_cpu_dec_return(mde_ref_count) == 0)
122 disable = ~DBG_MDSCR_MDE; 121 disable = ~DBG_MDSCR_MDE;
123 122
124 if (el == DBG_ACTIVE_EL1 && 123 if (el == DBG_ACTIVE_EL1 &&
125 local_dec_and_test(&__get_cpu_var(kde_ref_count))) 124 this_cpu_dec_return(kde_ref_count) == 0)
126 disable &= ~DBG_MDSCR_KDE; 125 disable &= ~DBG_MDSCR_KDE;
127 126
128 if (disable) { 127 if (disable) {
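
With the counters demoted from local_t to plain int, the increment/decrement and the test collapse into single this_cpu operations. A minimal sketch of the resulting per-CPU reference-count pattern, using a hypothetical counter name rather than the driver's own:

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(int, example_ref_count);          /* hypothetical */

    static void example_get(void)
    {
            /* this_cpu_inc_return() reads, modifies and writes this CPU's copy
             * as one operation, so the local_t wrapper is no longer needed. */
            if (this_cpu_inc_return(example_ref_count) == 1)
                    ;       /* first user on this CPU: enable the facility */
    }

    static void example_put(void)
    {
            if (this_cpu_dec_return(example_ref_count) == 0)
                    ;       /* last user on this CPU: disable the facility */
    }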
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 329218ca9ffb..ff516f6691e4 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -184,14 +184,14 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
184 /* Breakpoint */ 184 /* Breakpoint */
185 ctrl_reg = AARCH64_DBG_REG_BCR; 185 ctrl_reg = AARCH64_DBG_REG_BCR;
186 val_reg = AARCH64_DBG_REG_BVR; 186 val_reg = AARCH64_DBG_REG_BVR;
187 slots = __get_cpu_var(bp_on_reg); 187 slots = this_cpu_ptr(bp_on_reg);
188 max_slots = core_num_brps; 188 max_slots = core_num_brps;
189 reg_enable = !debug_info->bps_disabled; 189 reg_enable = !debug_info->bps_disabled;
190 } else { 190 } else {
191 /* Watchpoint */ 191 /* Watchpoint */
192 ctrl_reg = AARCH64_DBG_REG_WCR; 192 ctrl_reg = AARCH64_DBG_REG_WCR;
193 val_reg = AARCH64_DBG_REG_WVR; 193 val_reg = AARCH64_DBG_REG_WVR;
194 slots = __get_cpu_var(wp_on_reg); 194 slots = this_cpu_ptr(wp_on_reg);
195 max_slots = core_num_wrps; 195 max_slots = core_num_wrps;
196 reg_enable = !debug_info->wps_disabled; 196 reg_enable = !debug_info->wps_disabled;
197 } 197 }
@@ -230,12 +230,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
230 if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) { 230 if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
231 /* Breakpoint */ 231 /* Breakpoint */
232 base = AARCH64_DBG_REG_BCR; 232 base = AARCH64_DBG_REG_BCR;
233 slots = __get_cpu_var(bp_on_reg); 233 slots = this_cpu_ptr(bp_on_reg);
234 max_slots = core_num_brps; 234 max_slots = core_num_brps;
235 } else { 235 } else {
236 /* Watchpoint */ 236 /* Watchpoint */
237 base = AARCH64_DBG_REG_WCR; 237 base = AARCH64_DBG_REG_WCR;
238 slots = __get_cpu_var(wp_on_reg); 238 slots = this_cpu_ptr(wp_on_reg);
239 max_slots = core_num_wrps; 239 max_slots = core_num_wrps;
240 } 240 }
241 241
@@ -505,11 +505,11 @@ static void toggle_bp_registers(int reg, enum debug_el el, int enable)
505 505
506 switch (reg) { 506 switch (reg) {
507 case AARCH64_DBG_REG_BCR: 507 case AARCH64_DBG_REG_BCR:
508 slots = __get_cpu_var(bp_on_reg); 508 slots = this_cpu_ptr(bp_on_reg);
509 max_slots = core_num_brps; 509 max_slots = core_num_brps;
510 break; 510 break;
511 case AARCH64_DBG_REG_WCR: 511 case AARCH64_DBG_REG_WCR:
512 slots = __get_cpu_var(wp_on_reg); 512 slots = this_cpu_ptr(wp_on_reg);
513 max_slots = core_num_wrps; 513 max_slots = core_num_wrps;
514 break; 514 break;
515 default: 515 default:
@@ -546,7 +546,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
546 struct debug_info *debug_info; 546 struct debug_info *debug_info;
547 struct arch_hw_breakpoint_ctrl ctrl; 547 struct arch_hw_breakpoint_ctrl ctrl;
548 548
549 slots = (struct perf_event **)__get_cpu_var(bp_on_reg); 549 slots = this_cpu_ptr(bp_on_reg);
550 addr = instruction_pointer(regs); 550 addr = instruction_pointer(regs);
551 debug_info = &current->thread.debug; 551 debug_info = &current->thread.debug;
552 552
@@ -596,7 +596,7 @@ unlock:
596 user_enable_single_step(current); 596 user_enable_single_step(current);
597 } else { 597 } else {
598 toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0); 598 toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0);
599 kernel_step = &__get_cpu_var(stepping_kernel_bp); 599 kernel_step = this_cpu_ptr(&stepping_kernel_bp);
600 600
601 if (*kernel_step != ARM_KERNEL_STEP_NONE) 601 if (*kernel_step != ARM_KERNEL_STEP_NONE)
602 return 0; 602 return 0;
@@ -623,7 +623,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
623 struct arch_hw_breakpoint *info; 623 struct arch_hw_breakpoint *info;
624 struct arch_hw_breakpoint_ctrl ctrl; 624 struct arch_hw_breakpoint_ctrl ctrl;
625 625
626 slots = (struct perf_event **)__get_cpu_var(wp_on_reg); 626 slots = this_cpu_ptr(wp_on_reg);
627 debug_info = &current->thread.debug; 627 debug_info = &current->thread.debug;
628 628
629 for (i = 0; i < core_num_wrps; ++i) { 629 for (i = 0; i < core_num_wrps; ++i) {
@@ -698,7 +698,7 @@ unlock:
698 user_enable_single_step(current); 698 user_enable_single_step(current);
699 } else { 699 } else {
700 toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0); 700 toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0);
701 kernel_step = &__get_cpu_var(stepping_kernel_bp); 701 kernel_step = this_cpu_ptr(&stepping_kernel_bp);
702 702
703 if (*kernel_step != ARM_KERNEL_STEP_NONE) 703 if (*kernel_step != ARM_KERNEL_STEP_NONE)
704 return 0; 704 return 0;
@@ -722,7 +722,7 @@ int reinstall_suspended_bps(struct pt_regs *regs)
722 struct debug_info *debug_info = &current->thread.debug; 722 struct debug_info *debug_info = &current->thread.debug;
723 int handled_exception = 0, *kernel_step; 723 int handled_exception = 0, *kernel_step;
724 724
725 kernel_step = &__get_cpu_var(stepping_kernel_bp); 725 kernel_step = this_cpu_ptr(&stepping_kernel_bp);
726 726
727 /* 727 /*
728 * Called from single-step exception handler. 728 * Called from single-step exception handler.
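
The same conversion runs through this file: where the code previously took the lvalue of the per-CPU variable with __get_cpu_var() and let the array decay to a pointer, this_cpu_ptr() now hands back the pointer directly. A small sketch of the two shapes used above, per-CPU array and per-CPU scalar, with made-up names:

    #include <linux/percpu.h>

    struct perf_event;                                       /* opaque here */

    static DEFINE_PER_CPU(struct perf_event *, example_on_reg[4]);   /* array  */
    static DEFINE_PER_CPU(int, example_step_state);                  /* scalar */

    static void example_use(void)
    {
            struct perf_event **slots = this_cpu_ptr(example_on_reg);
            int *step = this_cpu_ptr(&example_step_state);

            /* ... index slots[0..3], read/write *step ... */
            (void)slots;
            (void)step;
    }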
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 5d14470452ac..0e63c98d224c 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1044,7 +1044,7 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev)
1044 */ 1044 */
1045 regs = get_irq_regs(); 1045 regs = get_irq_regs();
1046 1046
1047 cpuc = &__get_cpu_var(cpu_hw_events); 1047 cpuc = this_cpu_ptr(&cpu_hw_events);
1048 for (idx = 0; idx < cpu_pmu->num_events; ++idx) { 1048 for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
1049 struct perf_event *event = cpuc->events[idx]; 1049 struct perf_event *event = cpuc->events[idx];
1050 struct hw_perf_event *hwc; 1050 struct hw_perf_event *hwc;
@@ -1258,7 +1258,7 @@ device_initcall(register_pmu_driver);
1258 1258
1259static struct pmu_hw_events *armpmu_get_cpu_events(void) 1259static struct pmu_hw_events *armpmu_get_cpu_events(void)
1260{ 1260{
1261 return &__get_cpu_var(cpu_hw_events); 1261 return this_cpu_ptr(&cpu_hw_events);
1262} 1262}
1263 1263
1264static void __init cpu_pmu_init(struct arm_pmu *armpmu) 1264static void __init cpu_pmu_init(struct arm_pmu *armpmu)
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 69ce573f1224..71f337aefa39 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -776,6 +776,22 @@ config CRYPTO_AES_ARM
776 776
777 See <http://csrc.nist.gov/encryption/aes/> for more information. 777 See <http://csrc.nist.gov/encryption/aes/> for more information.
778 778
779config CRYPTO_AES_ARM_BS
780 tristate "Bit sliced AES using NEON instructions"
781 depends on ARM && KERNEL_MODE_NEON
782 select CRYPTO_ALGAPI
783 select CRYPTO_AES_ARM
784 select CRYPTO_ABLK_HELPER
785 help
786	  Use a faster and more secure NEON-based implementation of AES in CBC,
787	  CTR and XTS modes.
788
789	  Bit-sliced AES gives around a 45% speedup on Cortex-A15 for CTR mode
790	  and for XTS mode encryption; CBC and XTS mode decryption speedup is
791 around 25%. (CBC encryption speed is not affected by this driver.)
792 This implementation does not rely on any lookup tables so it is
793 believed to be invulnerable to cache timing attacks.
794
779config CRYPTO_ANUBIS 795config CRYPTO_ANUBIS
780 tristate "Anubis cipher algorithm" 796 tristate "Anubis cipher algorithm"
781 select CRYPTO_ALGAPI 797 select CRYPTO_ALGAPI
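
The "no lookup tables" claim in the help text is about data-dependent memory access. A minimal illustration (plain C, not the actual NEON implementation) of why table lookups can leak key-dependent cache state while straight-line bitwise code cannot:

    #include <stdint.h>

    static const uint8_t sbox[256];          /* real AES S-box values omitted */

    /* Table lookup: the address touched depends on the secret byte, so the
     * cache line it pulls in can leak key material to a co-resident attacker
     * measuring access latency. */
    static uint8_t sbox_lookup(uint8_t secret)
    {
            return sbox[secret];
    }

    /* Bit-sliced style: the same straight-line AND/OR/XOR sequence runs for
     * every input, with no secret-dependent addresses or branches. Real
     * bit-sliced AES evaluates the whole S-box as a boolean circuit over
     * NEON registers; this constant-time select only shows the principle. */
    static uint8_t constant_time_select(uint8_t mask, uint8_t a, uint8_t b)
    {
            return (uint8_t)((mask & a) | ((uint8_t)~mask & b));
    }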
diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c
index bb5b90e8e768..b6739cb78e32 100644
--- a/drivers/bus/arm-cci.c
+++ b/drivers/bus/arm-cci.c
@@ -852,7 +852,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
852 852
853 /* Enable the CCI port */ 853 /* Enable the CCI port */
854" ldr r0, [r0, %[offsetof_port_phys]] \n" 854" ldr r0, [r0, %[offsetof_port_phys]] \n"
855" mov r3, #"__stringify(CCI_ENABLE_REQ)" \n" 855" mov r3, %[cci_enable_req]\n"
856" str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n" 856" str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n"
857 857
858 /* poll the status reg for completion */ 858 /* poll the status reg for completion */
@@ -860,7 +860,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
860" ldr r0, [r1] \n" 860" ldr r0, [r1] \n"
861" ldr r0, [r0, r1] @ cci_ctrl_base \n" 861" ldr r0, [r0, r1] @ cci_ctrl_base \n"
862"4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n" 862"4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n"
863" tst r1, #1 \n" 863" tst r1, %[cci_control_status_bits] \n"
864" bne 4b \n" 864" bne 4b \n"
865 865
866" mov r0, #0 \n" 866" mov r0, #0 \n"
@@ -873,6 +873,8 @@ asmlinkage void __naked cci_enable_port_for_self(void)
873"7: .word cci_ctrl_phys - . \n" 873"7: .word cci_ctrl_phys - . \n"
874 : : 874 : :
875 [sizeof_cpu_port] "i" (sizeof(cpu_port)), 875 [sizeof_cpu_port] "i" (sizeof(cpu_port)),
876 [cci_enable_req] "i" cpu_to_le32(CCI_ENABLE_REQ),
877 [cci_control_status_bits] "i" cpu_to_le32(1),
876#ifndef __ARMEB__ 878#ifndef __ARMEB__
877 [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)), 879 [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)),
878#else 880#else
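
Moving the constants out of the asm string and passing them as "i" operands wrapped in cpu_to_le32() presumably keeps these raw ldr/str accesses to the little-endian CCI registers correct on a big-endian (BE8) kernel, since a bare str does not perform the byte swap that writel_relaxed() would. A compile-time-swap sketch of that idea (illustrative only, not the driver code):

    #include <stdint.h>

    /* Fold the swap in at build time when compiling big-endian, the same way
     * cpu_to_le32() does for constant arguments. */
    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    # define sketch_cpu_to_le32(x)  __builtin_bswap32(x)
    #else
    # define sketch_cpu_to_le32(x)  (x)
    #endif

    static void raw_reg_write(volatile uint32_t *reg, uint32_t val)
    {
            *reg = sketch_cpu_to_le32(val);          /* what writel_relaxed() hides */
    }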
diff --git a/drivers/gpio/gpio-sa1100.c b/drivers/gpio/gpio-sa1100.c
index 8ea3b33d4b40..a90be34e4d5c 100644
--- a/drivers/gpio/gpio-sa1100.c
+++ b/drivers/gpio/gpio-sa1100.c
@@ -10,7 +10,7 @@
10#include <linux/gpio.h> 10#include <linux/gpio.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/module.h> 12#include <linux/module.h>
13 13#include <linux/io.h>
14#include <mach/hardware.h> 14#include <mach/hardware.h>
15#include <mach/irqs.h> 15#include <mach/irqs.h>
16 16
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index d0e948084eaf..9031171c141b 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
253 if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) 253 if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
254 return -EINVAL; 254 return -EINVAL;
255 255
256 raw_spin_lock(&irq_controller_lock);
256 mask = 0xff << shift; 257 mask = 0xff << shift;
257 bit = gic_cpu_map[cpu] << shift; 258 bit = gic_cpu_map[cpu] << shift;
258
259 raw_spin_lock(&irq_controller_lock);
260 val = readl_relaxed(reg) & ~mask; 259 val = readl_relaxed(reg) & ~mask;
261 writel_relaxed(val | bit, reg); 260 writel_relaxed(val | bit, reg);
262 raw_spin_unlock(&irq_controller_lock); 261 raw_spin_unlock(&irq_controller_lock);
@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic)
652void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) 651void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
653{ 652{
654 int cpu; 653 int cpu;
655 unsigned long map = 0; 654 unsigned long flags, map = 0;
655
656 raw_spin_lock_irqsave(&irq_controller_lock, flags);
656 657
657 /* Convert our logical CPU mask into a physical one. */ 658 /* Convert our logical CPU mask into a physical one. */
658 for_each_cpu(cpu, mask) 659 for_each_cpu(cpu, mask)
@@ -666,7 +667,149 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
666 667
667 /* this always happens on GIC0 */ 668 /* this always happens on GIC0 */
668 writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); 669 writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
670
671 raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
672}
673#endif
674
675#ifdef CONFIG_BL_SWITCHER
676/*
 677 * gic_send_sgi - send an SGI directly to a given CPU interface number
678 *
679 * cpu_id: the ID for the destination CPU interface
680 * irq: the IPI number to send a SGI for
681 */
682void gic_send_sgi(unsigned int cpu_id, unsigned int irq)
683{
684 BUG_ON(cpu_id >= NR_GIC_CPU_IF);
685 cpu_id = 1 << cpu_id;
686 /* this always happens on GIC0 */
687 writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
688}
689
690/*
691 * gic_get_cpu_id - get the CPU interface ID for the specified CPU
692 *
693 * @cpu: the logical CPU number to get the GIC ID for.
694 *
695 * Return the CPU interface ID for the given logical CPU number,
696 * or -1 if the CPU number is too large or the interface ID is
697 * unknown (more than one bit set).
698 */
699int gic_get_cpu_id(unsigned int cpu)
700{
701 unsigned int cpu_bit;
702
703 if (cpu >= NR_GIC_CPU_IF)
704 return -1;
705 cpu_bit = gic_cpu_map[cpu];
706 if (cpu_bit & (cpu_bit - 1))
707 return -1;
708 return __ffs(cpu_bit);
669} 709}
710
711/*
712 * gic_migrate_target - migrate IRQs to another CPU interface
713 *
714 * @new_cpu_id: the CPU target ID to migrate IRQs to
715 *
716 * Migrate all peripheral interrupts with a target matching the current CPU
717 * to the interface corresponding to @new_cpu_id. The CPU interface mapping
718 * is also updated. Targets to other CPU interfaces are unchanged.
719 * This must be called with IRQs locally disabled.
720 */
721void gic_migrate_target(unsigned int new_cpu_id)
722{
723 unsigned int cur_cpu_id, gic_irqs, gic_nr = 0;
724 void __iomem *dist_base;
725 int i, ror_val, cpu = smp_processor_id();
726 u32 val, cur_target_mask, active_mask;
727
728 if (gic_nr >= MAX_GIC_NR)
729 BUG();
730
731 dist_base = gic_data_dist_base(&gic_data[gic_nr]);
732 if (!dist_base)
733 return;
734 gic_irqs = gic_data[gic_nr].gic_irqs;
735
736 cur_cpu_id = __ffs(gic_cpu_map[cpu]);
737 cur_target_mask = 0x01010101 << cur_cpu_id;
738 ror_val = (cur_cpu_id - new_cpu_id) & 31;
739
740 raw_spin_lock(&irq_controller_lock);
741
742 /* Update the target interface for this logical CPU */
743 gic_cpu_map[cpu] = 1 << new_cpu_id;
744
745 /*
 746 * Find all the peripheral interrupts targeting the current
747 * CPU interface and migrate them to the new CPU interface.
748 * We skip DIST_TARGET 0 to 7 as they are read-only.
749 */
750 for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) {
751 val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
752 active_mask = val & cur_target_mask;
753 if (active_mask) {
754 val &= ~active_mask;
755 val |= ror32(active_mask, ror_val);
756 writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4);
757 }
758 }
759
760 raw_spin_unlock(&irq_controller_lock);
761
762 /*
763 * Now let's migrate and clear any potential SGIs that might be
764 * pending for us (cur_cpu_id). Since GIC_DIST_SGI_PENDING_SET
765 * is a banked register, we can only forward the SGI using
766 * GIC_DIST_SOFTINT. The original SGI source is lost but Linux
767 * doesn't use that information anyway.
768 *
769 * For the same reason we do not adjust SGI source information
770 * for previously sent SGIs by us to other CPUs either.
771 */
772 for (i = 0; i < 16; i += 4) {
773 int j;
774 val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i);
775 if (!val)
776 continue;
777 writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i);
778 for (j = i; j < i + 4; j++) {
779 if (val & 0xff)
780 writel_relaxed((1 << (new_cpu_id + 16)) | j,
781 dist_base + GIC_DIST_SOFTINT);
782 val >>= 8;
783 }
784 }
785}
786
787/*
788 * gic_get_sgir_physaddr - get the physical address for the SGI register
789 *
 790 * Return the physical address of the SGI register to be used
791 * by some early assembly code when the kernel is not yet available.
792 */
793static unsigned long gic_dist_physaddr;
794
795unsigned long gic_get_sgir_physaddr(void)
796{
797 if (!gic_dist_physaddr)
798 return 0;
799 return gic_dist_physaddr + GIC_DIST_SOFTINT;
800}
801
802void __init gic_init_physaddr(struct device_node *node)
803{
804 struct resource res;
805 if (of_address_to_resource(node, 0, &res) == 0) {
806 gic_dist_physaddr = res.start;
807 pr_info("GIC physical location is %#lx\n", gic_dist_physaddr);
808 }
809}
810
811#else
812#define gic_init_physaddr(node) do { } while (0)
670#endif 813#endif
671 814
672static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, 815static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
@@ -850,6 +993,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent)
850 percpu_offset = 0; 993 percpu_offset = 0;
851 994
852 gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); 995 gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
996 if (!gic_cnt)
997 gic_init_physaddr(node);
853 998
854 if (parent) { 999 if (parent) {
855 irq = irq_of_parse_and_map(node, 0); 1000 irq = irq_of_parse_and_map(node, 0);
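
The retargeting loop in gic_migrate_target() works because each GIC_DIST_TARGET word packs the 8-bit CPU masks of four interrupts, so rotating the isolated bits by (cur_cpu_id - new_cpu_id) positions moves every affected interrupt to the new interface in one pass. A stand-alone worked example with a made-up register value:

    #include <stdint.h>
    #include <stdio.h>

    static inline uint32_t ror32_sketch(uint32_t w, unsigned int s)
    {
            return s ? (w >> s) | (w << (32 - s)) : w;
    }

    int main(void)
    {
            /* Migrate from CPU interface 0 to interface 1. Each byte is one
             * interrupt's target mask; bit N means "route to interface N". */
            unsigned int cur = 0, new_if = 1;
            uint32_t val         = 0x01020105;               /* hypothetical word  */
            uint32_t cur_mask    = 0x01010101u << cur;
            uint32_t active      = val & cur_mask;           /* irqs aimed at us   */
            unsigned int ror_val = (cur - new_if) & 31;      /* 31, i.e. rotate <<1 */

            val &= ~active;
            val |= ror32_sketch(active, ror_val);
            printf("migrated target word: 0x%08x\n", (unsigned int)val); /* 0x02020206 */
            return 0;
    }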
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index c3785edc0e92..d135c76c4855 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -62,6 +62,7 @@ static unsigned int fmax = 515633;
62 * @signal_direction: input/out direction of bus signals can be indicated 62 * @signal_direction: input/out direction of bus signals can be indicated
63 * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock 63 * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
64 * @busy_detect: true if busy detection on dat0 is supported 64 * @busy_detect: true if busy detection on dat0 is supported
 65 * @pwrreg_nopower: bits in MMCIPOWER don't control ext. power supply
65 */ 66 */
66struct variant_data { 67struct variant_data {
67 unsigned int clkreg; 68 unsigned int clkreg;
@@ -76,6 +77,7 @@ struct variant_data {
76 bool signal_direction; 77 bool signal_direction;
77 bool pwrreg_clkgate; 78 bool pwrreg_clkgate;
78 bool busy_detect; 79 bool busy_detect;
80 bool pwrreg_nopower;
79}; 81};
80 82
81static struct variant_data variant_arm = { 83static struct variant_data variant_arm = {
@@ -109,6 +111,7 @@ static struct variant_data variant_u300 = {
109 .pwrreg_powerup = MCI_PWR_ON, 111 .pwrreg_powerup = MCI_PWR_ON,
110 .signal_direction = true, 112 .signal_direction = true,
111 .pwrreg_clkgate = true, 113 .pwrreg_clkgate = true,
114 .pwrreg_nopower = true,
112}; 115};
113 116
114static struct variant_data variant_nomadik = { 117static struct variant_data variant_nomadik = {
@@ -121,6 +124,7 @@ static struct variant_data variant_nomadik = {
121 .pwrreg_powerup = MCI_PWR_ON, 124 .pwrreg_powerup = MCI_PWR_ON,
122 .signal_direction = true, 125 .signal_direction = true,
123 .pwrreg_clkgate = true, 126 .pwrreg_clkgate = true,
127 .pwrreg_nopower = true,
124}; 128};
125 129
126static struct variant_data variant_ux500 = { 130static struct variant_data variant_ux500 = {
@@ -135,6 +139,7 @@ static struct variant_data variant_ux500 = {
135 .signal_direction = true, 139 .signal_direction = true,
136 .pwrreg_clkgate = true, 140 .pwrreg_clkgate = true,
137 .busy_detect = true, 141 .busy_detect = true,
142 .pwrreg_nopower = true,
138}; 143};
139 144
140static struct variant_data variant_ux500v2 = { 145static struct variant_data variant_ux500v2 = {
@@ -150,6 +155,7 @@ static struct variant_data variant_ux500v2 = {
150 .signal_direction = true, 155 .signal_direction = true,
151 .pwrreg_clkgate = true, 156 .pwrreg_clkgate = true,
152 .busy_detect = true, 157 .busy_detect = true,
158 .pwrreg_nopower = true,
153}; 159};
154 160
155static int mmci_card_busy(struct mmc_host *mmc) 161static int mmci_card_busy(struct mmc_host *mmc)
@@ -189,6 +195,21 @@ static int mmci_validate_data(struct mmci_host *host,
189 return 0; 195 return 0;
190} 196}
191 197
198static void mmci_reg_delay(struct mmci_host *host)
199{
200 /*
201 * According to the spec, at least three feedback clock cycles
202 * of max 52 MHz must pass between two writes to the MMCICLOCK reg.
203 * Three MCLK clock cycles must pass between two MMCIPOWER reg writes.
204 * Worst delay time during card init is at 100 kHz => 30 us.
205 * Worst delay time when up and running is at 25 MHz => 120 ns.
206 */
207 if (host->cclk < 25000000)
208 udelay(30);
209 else
210 ndelay(120);
211}
212
192/* 213/*
193 * This must be called with host->lock held 214 * This must be called with host->lock held
194 */ 215 */
@@ -1264,6 +1285,7 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
1264 1285
1265 mmci_set_clkreg(host, ios->clock); 1286 mmci_set_clkreg(host, ios->clock);
1266 mmci_write_pwrreg(host, pwr); 1287 mmci_write_pwrreg(host, pwr);
1288 mmci_reg_delay(host);
1267 1289
1268 spin_unlock_irqrestore(&host->lock, flags); 1290 spin_unlock_irqrestore(&host->lock, flags);
1269 1291
@@ -1510,23 +1532,6 @@ static int mmci_probe(struct amba_device *dev,
1510 mmc->f_max = min(host->mclk, fmax); 1532 mmc->f_max = min(host->mclk, fmax);
1511 dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max); 1533 dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
1512 1534
1513 host->pinctrl = devm_pinctrl_get(&dev->dev);
1514 if (IS_ERR(host->pinctrl)) {
1515 ret = PTR_ERR(host->pinctrl);
1516 goto clk_disable;
1517 }
1518
1519 host->pins_default = pinctrl_lookup_state(host->pinctrl,
1520 PINCTRL_STATE_DEFAULT);
1521
1522 /* enable pins to be muxed in and configured */
1523 if (!IS_ERR(host->pins_default)) {
1524 ret = pinctrl_select_state(host->pinctrl, host->pins_default);
1525 if (ret)
1526 dev_warn(&dev->dev, "could not set default pins\n");
1527 } else
1528 dev_warn(&dev->dev, "could not get default pinstate\n");
1529
1530 /* Get regulators and the supported OCR mask */ 1535 /* Get regulators and the supported OCR mask */
1531 mmc_regulator_get_supply(mmc); 1536 mmc_regulator_get_supply(mmc);
1532 if (!mmc->ocr_avail) 1537 if (!mmc->ocr_avail)
@@ -1760,6 +1765,41 @@ static int mmci_resume(struct device *dev)
1760#endif 1765#endif
1761 1766
1762#ifdef CONFIG_PM_RUNTIME 1767#ifdef CONFIG_PM_RUNTIME
1768static void mmci_save(struct mmci_host *host)
1769{
1770 unsigned long flags;
1771
1772 if (host->variant->pwrreg_nopower) {
1773 spin_lock_irqsave(&host->lock, flags);
1774
1775 writel(0, host->base + MMCIMASK0);
1776 writel(0, host->base + MMCIDATACTRL);
1777 writel(0, host->base + MMCIPOWER);
1778 writel(0, host->base + MMCICLOCK);
1779 mmci_reg_delay(host);
1780
1781 spin_unlock_irqrestore(&host->lock, flags);
1782 }
1783
1784}
1785
1786static void mmci_restore(struct mmci_host *host)
1787{
1788 unsigned long flags;
1789
1790 if (host->variant->pwrreg_nopower) {
1791 spin_lock_irqsave(&host->lock, flags);
1792
1793 writel(host->clk_reg, host->base + MMCICLOCK);
1794 writel(host->datactrl_reg, host->base + MMCIDATACTRL);
1795 writel(host->pwr_reg, host->base + MMCIPOWER);
1796 writel(MCI_IRQENABLE, host->base + MMCIMASK0);
1797 mmci_reg_delay(host);
1798
1799 spin_unlock_irqrestore(&host->lock, flags);
1800 }
1801}
1802
1763static int mmci_runtime_suspend(struct device *dev) 1803static int mmci_runtime_suspend(struct device *dev)
1764{ 1804{
1765 struct amba_device *adev = to_amba_device(dev); 1805 struct amba_device *adev = to_amba_device(dev);
@@ -1767,6 +1807,8 @@ static int mmci_runtime_suspend(struct device *dev)
1767 1807
1768 if (mmc) { 1808 if (mmc) {
1769 struct mmci_host *host = mmc_priv(mmc); 1809 struct mmci_host *host = mmc_priv(mmc);
1810 pinctrl_pm_select_sleep_state(dev);
1811 mmci_save(host);
1770 clk_disable_unprepare(host->clk); 1812 clk_disable_unprepare(host->clk);
1771 } 1813 }
1772 1814
@@ -1781,6 +1823,8 @@ static int mmci_runtime_resume(struct device *dev)
1781 if (mmc) { 1823 if (mmc) {
1782 struct mmci_host *host = mmc_priv(mmc); 1824 struct mmci_host *host = mmc_priv(mmc);
1783 clk_prepare_enable(host->clk); 1825 clk_prepare_enable(host->clk);
1826 mmci_restore(host);
1827 pinctrl_pm_select_default_state(dev);
1784 } 1828 }
1785 1829
1786 return 0; 1830 return 0;
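
The delay bounds quoted in mmci_reg_delay() follow directly from "three clock cycles" at the two extreme bus clocks; a quick stand-alone check of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
            /* Three clock cycles expressed in nanoseconds for the two corner
             * cases named in the mmci_reg_delay() comment. */
            double t_init    = 3.0 / 100e3 * 1e9;   /* 100 kHz card-init clock   */
            double t_running = 3.0 / 25e6  * 1e9;   /*  25 MHz operational clock */

            printf("init: %.0f ns (~30 us), running: %.0f ns\n",
                   t_init, t_running);              /* 30000 ns and 120 ns */
            return 0;
    }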
diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h
index 69080fab6375..168bc72f7a94 100644
--- a/drivers/mmc/host/mmci.h
+++ b/drivers/mmc/host/mmci.h
@@ -200,10 +200,6 @@ struct mmci_host {
200 struct sg_mapping_iter sg_miter; 200 struct sg_mapping_iter sg_miter;
201 unsigned int size; 201 unsigned int size;
202 202
203 /* pinctrl handles */
204 struct pinctrl *pinctrl;
205 struct pinctrl_state *pins_default;
206
207#ifdef CONFIG_DMA_ENGINE 203#ifdef CONFIG_DMA_ENGINE
208 /* DMA stuff */ 204 /* DMA stuff */
209 struct dma_chan *dma_current; 205 struct dma_chan *dma_current;
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 682df0e1954a..63b5eff0a80f 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -21,7 +21,7 @@
21#include <linux/resource.h> 21#include <linux/resource.h>
22#include <linux/regulator/consumer.h> 22#include <linux/regulator/consumer.h>
23 23
24#define AMBA_NR_IRQS 2 24#define AMBA_NR_IRQS 9
25#define AMBA_CID 0xb105f00d 25#define AMBA_CID 0xb105f00d
26 26
27struct clk; 27struct clk;
diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h
index 0e5d9ecdb2b6..cac496b1e279 100644
--- a/include/linux/irqchip/arm-gic.h
+++ b/include/linux/irqchip/arm-gic.h
@@ -31,6 +31,8 @@
31#define GIC_DIST_TARGET 0x800 31#define GIC_DIST_TARGET 0x800
32#define GIC_DIST_CONFIG 0xc00 32#define GIC_DIST_CONFIG 0xc00
33#define GIC_DIST_SOFTINT 0xf00 33#define GIC_DIST_SOFTINT 0xf00
34#define GIC_DIST_SGI_PENDING_CLEAR 0xf10
35#define GIC_DIST_SGI_PENDING_SET 0xf20
34 36
35#define GICH_HCR 0x0 37#define GICH_HCR 0x0
36#define GICH_VTR 0x4 38#define GICH_VTR 0x4
@@ -74,6 +76,11 @@ static inline void gic_init(unsigned int nr, int start,
74 gic_init_bases(nr, start, dist, cpu, 0, NULL); 76 gic_init_bases(nr, start, dist, cpu, 0, NULL);
75} 77}
76 78
79void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
80int gic_get_cpu_id(unsigned int cpu);
81void gic_migrate_target(unsigned int new_cpu_id);
82unsigned long gic_get_sgir_physaddr(void);
83
77#endif /* __ASSEMBLY */ 84#endif /* __ASSEMBLY */
78 85
79#endif 86#endif
diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h
new file mode 100644
index 000000000000..f76dd4de625e
--- /dev/null
+++ b/include/trace/events/power_cpu_migrate.h
@@ -0,0 +1,67 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM power
3
4#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_POWER_CPU_MIGRATE_H
6
7#include <linux/tracepoint.h>
8
9#define __cpu_migrate_proto \
10 TP_PROTO(u64 timestamp, \
11 u32 cpu_hwid)
12#define __cpu_migrate_args \
13 TP_ARGS(timestamp, \
14 cpu_hwid)
15
16DECLARE_EVENT_CLASS(cpu_migrate,
17
18 __cpu_migrate_proto,
19 __cpu_migrate_args,
20
21 TP_STRUCT__entry(
22 __field(u64, timestamp )
23 __field(u32, cpu_hwid )
24 ),
25
26 TP_fast_assign(
27 __entry->timestamp = timestamp;
28 __entry->cpu_hwid = cpu_hwid;
29 ),
30
31 TP_printk("timestamp=%llu cpu_hwid=0x%08lX",
32 (unsigned long long)__entry->timestamp,
33 (unsigned long)__entry->cpu_hwid
34 )
35);
36
37#define __define_cpu_migrate_event(name) \
38 DEFINE_EVENT(cpu_migrate, cpu_migrate_##name, \
39 __cpu_migrate_proto, \
40 __cpu_migrate_args \
41 )
42
43__define_cpu_migrate_event(begin);
44__define_cpu_migrate_event(finish);
45__define_cpu_migrate_event(current);
46
47#undef __define_cpu_migrate
48#undef __cpu_migrate_proto
49#undef __cpu_migrate_args
50
 51/* This file can get included multiple times, hence TRACE_HEADER_MULTI_READ at the top */
52#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
53#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
54
55/*
56 * Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate
57 * a whole-cluster migration:
58 */
59#define CPU_MIGRATE_ALL_CPUS 0x80000000U
60#endif
61
62#endif /* _TRACE_POWER_CPU_MIGRATE_H */
63
64/* This part must be outside protection */
65#undef TRACE_INCLUDE_FILE
66#define TRACE_INCLUDE_FILE power_cpu_migrate
67#include <trace/define_trace.h>
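
Each DEFINE_EVENT() above generates a trace_cpu_migrate_<name>() hook for kernel code to fire. A sketch of the caller side, assuming the standard tracepoint boilerplate; the real callers live in the big.LITTLE switcher code, and the function here is made up:

    #include <linux/types.h>

    #define CREATE_TRACE_POINTS
    #include <trace/events/power_cpu_migrate.h>

    static void example_switch_cluster(u64 timestamp, u32 cpu_hwid)
    {
            trace_cpu_migrate_begin(timestamp, cpu_hwid);
            /* ... hand execution over to the inbound CPU ... */
            trace_cpu_migrate_finish(timestamp, cpu_hwid);
    }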
diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile
index 15130b50dfe3..fe9b61e322a5 100644
--- a/tools/perf/arch/arm/Makefile
+++ b/tools/perf/arch/arm/Makefile
@@ -2,3 +2,6 @@ ifndef NO_DWARF
2PERF_HAVE_DWARF_REGS := 1 2PERF_HAVE_DWARF_REGS := 1
3LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o 3LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
4endif 4endif
5ifndef NO_LIBUNWIND
6LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o
7endif
diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
new file mode 100644
index 000000000000..2a1cfde66b69
--- /dev/null
+++ b/tools/perf/arch/arm/include/perf_regs.h
@@ -0,0 +1,54 @@
1#ifndef ARCH_PERF_REGS_H
2#define ARCH_PERF_REGS_H
3
4#include <stdlib.h>
5#include "../../util/types.h"
6#include <asm/perf_regs.h>
7
8#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1)
9#define PERF_REG_IP PERF_REG_ARM_PC
10#define PERF_REG_SP PERF_REG_ARM_SP
11
12static inline const char *perf_reg_name(int id)
13{
14 switch (id) {
15 case PERF_REG_ARM_R0:
16 return "r0";
17 case PERF_REG_ARM_R1:
18 return "r1";
19 case PERF_REG_ARM_R2:
20 return "r2";
21 case PERF_REG_ARM_R3:
22 return "r3";
23 case PERF_REG_ARM_R4:
24 return "r4";
25 case PERF_REG_ARM_R5:
26 return "r5";
27 case PERF_REG_ARM_R6:
28 return "r6";
29 case PERF_REG_ARM_R7:
30 return "r7";
31 case PERF_REG_ARM_R8:
32 return "r8";
33 case PERF_REG_ARM_R9:
34 return "r9";
35 case PERF_REG_ARM_R10:
36 return "r10";
37 case PERF_REG_ARM_FP:
38 return "fp";
39 case PERF_REG_ARM_IP:
40 return "ip";
41 case PERF_REG_ARM_SP:
42 return "sp";
43 case PERF_REG_ARM_LR:
44 return "lr";
45 case PERF_REG_ARM_PC:
46 return "pc";
47 default:
48 return NULL;
49 }
50
51 return NULL;
52}
53
54#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/arm/util/unwind.c b/tools/perf/arch/arm/util/unwind.c
new file mode 100644
index 000000000000..da3dc950550c
--- /dev/null
+++ b/tools/perf/arch/arm/util/unwind.c
@@ -0,0 +1,48 @@
1
2#include <errno.h>
3#include <libunwind.h>
4#include "perf_regs.h"
5#include "../../util/unwind.h"
6
7int unwind__arch_reg_id(int regnum)
8{
9 switch (regnum) {
10 case UNW_ARM_R0:
11 return PERF_REG_ARM_R0;
12 case UNW_ARM_R1:
13 return PERF_REG_ARM_R1;
14 case UNW_ARM_R2:
15 return PERF_REG_ARM_R2;
16 case UNW_ARM_R3:
17 return PERF_REG_ARM_R3;
18 case UNW_ARM_R4:
19 return PERF_REG_ARM_R4;
20 case UNW_ARM_R5:
21 return PERF_REG_ARM_R5;
22 case UNW_ARM_R6:
23 return PERF_REG_ARM_R6;
24 case UNW_ARM_R7:
25 return PERF_REG_ARM_R7;
26 case UNW_ARM_R8:
27 return PERF_REG_ARM_R8;
28 case UNW_ARM_R9:
29 return PERF_REG_ARM_R9;
30 case UNW_ARM_R10:
31 return PERF_REG_ARM_R10;
32 case UNW_ARM_R11:
33 return PERF_REG_ARM_FP;
34 case UNW_ARM_R12:
35 return PERF_REG_ARM_IP;
36 case UNW_ARM_R13:
37 return PERF_REG_ARM_SP;
38 case UNW_ARM_R14:
39 return PERF_REG_ARM_LR;
40 case UNW_ARM_R15:
41 return PERF_REG_ARM_PC;
42 default:
43 pr_err("unwind: invalid reg id %d\n", regnum);
44 return -EINVAL;
45 }
46
47 return -EINVAL;
48}
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 58b2d37ae23a..f5905f2b197d 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -31,6 +31,10 @@ ifeq ($(ARCH),x86_64)
31 endif 31 endif
32 NO_PERF_REGS := 0 32 NO_PERF_REGS := 0
33endif 33endif
34ifeq ($(ARCH),arm)
35 NO_PERF_REGS := 0
36 LIBUNWIND_LIBS = -lunwind -lunwind-arm
37endif
34 38
35ifeq ($(NO_PERF_REGS),0) 39ifeq ($(NO_PERF_REGS),0)
36 CFLAGS += -DHAVE_PERF_REGS_SUPPORT 40 CFLAGS += -DHAVE_PERF_REGS_SUPPORT
@@ -305,8 +309,7 @@ ifndef NO_LIBELF
305 endif # NO_DWARF 309 endif # NO_DWARF
306endif # NO_LIBELF 310endif # NO_LIBELF
307 311
308# There's only x86 (both 32 and 64) support for CFI unwind so far 312ifeq ($(LIBUNWIND_LIBS),)
309ifneq ($(ARCH),x86)
310 NO_LIBUNWIND := 1 313 NO_LIBUNWIND := 1
311endif 314endif
312 315
@@ -322,8 +325,13 @@ ifndef NO_LIBUNWIND
322 endif 325 endif
323 326
324 ifneq ($(feature-libunwind), 1) 327 ifneq ($(feature-libunwind), 1)
325 msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99); 328 msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
326 NO_LIBUNWIND := 1 329 NO_LIBUNWIND := 1
330 else
331 ifneq ($(feature-libunwind-debug-frame), 1)
332 msg := $(warning No debug_frame support found in libunwind);
333 CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
334 endif
327 endif 335 endif
328endif 336endif
329 337
diff --git a/tools/perf/config/feature-checks/Makefile b/tools/perf/config/feature-checks/Makefile
index c803f17fb986..e8e195f49a4e 100644
--- a/tools/perf/config/feature-checks/Makefile
+++ b/tools/perf/config/feature-checks/Makefile
@@ -23,6 +23,7 @@ FILES= \
23 test-libpython-version \ 23 test-libpython-version \
24 test-libslang \ 24 test-libslang \
25 test-libunwind \ 25 test-libunwind \
26 test-libunwind-debug-frame \
26 test-on-exit \ 27 test-on-exit \
27 test-stackprotector-all \ 28 test-stackprotector-all \
28 test-stackprotector \ 29 test-stackprotector \
diff --git a/tools/perf/config/feature-checks/test-all.c b/tools/perf/config/feature-checks/test-all.c
index 59e7a705e146..799865b60772 100644
--- a/tools/perf/config/feature-checks/test-all.c
+++ b/tools/perf/config/feature-checks/test-all.c
@@ -49,6 +49,10 @@
49# include "test-libunwind.c" 49# include "test-libunwind.c"
50#undef main 50#undef main
51 51
52#define main main_test_libunwind_debug_frame
53# include "test-libunwind-debug-frame.c"
54#undef main
55
52#define main main_test_libaudit 56#define main main_test_libaudit
53# include "test-libaudit.c" 57# include "test-libaudit.c"
54#undef main 58#undef main
diff --git a/tools/perf/config/feature-checks/test-libunwind-debug-frame.c b/tools/perf/config/feature-checks/test-libunwind-debug-frame.c
new file mode 100644
index 000000000000..0ef8087a104a
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libunwind-debug-frame.c
@@ -0,0 +1,16 @@
1#include <libunwind.h>
2#include <stdlib.h>
3
4extern int
5UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
6 unw_word_t ip, unw_word_t segbase,
7 const char *obj_name, unw_word_t start,
8 unw_word_t end);
9
10#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
11
12int main(void)
13{
14 dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
15 return 0;
16}
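
Like the other probes in feature-checks/, this one succeeds or fails at link time: it declares the versioned dwarf_find_debug_frame symbol and calls it once, so the build only adds -DNO_LIBUNWIND_DEBUG_FRAME when the installed libunwind cannot satisfy the reference. The generic shape of such a probe, with a hypothetical symbol name:

    /* Hypothetical feature probe: always compiles, but links only when the
     * library actually exports the symbol being tested for. */
    extern int optional_library_symbol(void);

    int main(void)
    {
            return optional_library_symbol();
    }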
diff --git a/tools/perf/util/unwind.c b/tools/perf/util/unwind.c
index 2f891f7e70bf..5390d0b8862a 100644
--- a/tools/perf/util/unwind.c
+++ b/tools/perf/util/unwind.c
@@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
39 39
40#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) 40#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
41 41
42extern int
43UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
44 unw_word_t ip,
45 unw_word_t segbase,
46 const char *obj_name, unw_word_t start,
47 unw_word_t end);
48
49#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
50
42#define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */ 51#define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */
43#define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */ 52#define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */
44 53
@@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
245 return 0; 254 return 0;
246} 255}
247 256
248static int read_unwind_spec(struct dso *dso, struct machine *machine, 257static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
249 u64 *table_data, u64 *segbase, u64 *fde_count) 258 u64 *table_data, u64 *segbase,
259 u64 *fde_count)
250{ 260{
251 int ret = -EINVAL, fd; 261 int ret = -EINVAL, fd;
252 u64 offset; 262 u64 offset;
@@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
255 if (fd < 0) 265 if (fd < 0)
256 return -EINVAL; 266 return -EINVAL;
257 267
268 /* Check the .eh_frame section for unwinding info */
258 offset = elf_section_offset(fd, ".eh_frame_hdr"); 269 offset = elf_section_offset(fd, ".eh_frame_hdr");
259 close(fd); 270 close(fd);
260 271
@@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
263 table_data, segbase, 274 table_data, segbase,
264 fde_count); 275 fde_count);
265 276
266 /* TODO .debug_frame check if eh_frame_hdr fails */
267 return ret; 277 return ret;
268} 278}
269 279
280#ifndef NO_LIBUNWIND_DEBUG_FRAME
281static int read_unwind_spec_debug_frame(struct dso *dso,
282 struct machine *machine, u64 *offset)
283{
284 int fd = dso__data_fd(dso, machine);
285
286 if (fd < 0)
287 return -EINVAL;
288
289 /* Check the .debug_frame section for unwinding info */
290 *offset = elf_section_offset(fd, ".debug_frame");
291 close(fd);
292
293 if (*offset)
294 return 0;
295
296 return -EINVAL;
297}
298#endif
299
270static struct map *find_map(unw_word_t ip, struct unwind_info *ui) 300static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
271{ 301{
272 struct addr_location al; 302 struct addr_location al;
@@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
291 321
292 pr_debug("unwind: find_proc_info dso %s\n", map->dso->name); 322 pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
293 323
294 if (read_unwind_spec(map->dso, ui->machine, 324 /* Check the .eh_frame section for unwinding info */
295 &table_data, &segbase, &fde_count)) 325 if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
296 return -EINVAL; 326 &table_data, &segbase, &fde_count)) {
327 memset(&di, 0, sizeof(di));
328 di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
329 di.start_ip = map->start;
330 di.end_ip = map->end;
331 di.u.rti.segbase = map->start + segbase;
332 di.u.rti.table_data = map->start + table_data;
333 di.u.rti.table_len = fde_count * sizeof(struct table_entry)
334 / sizeof(unw_word_t);
335 return dwarf_search_unwind_table(as, ip, &di, pi,
336 need_unwind_info, arg);
337 }
338
339#ifndef NO_LIBUNWIND_DEBUG_FRAME
340 /* Check the .debug_frame section for unwinding info */
341 if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
342 memset(&di, 0, sizeof(di));
343 dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name,
344 map->start, map->end);
345 return dwarf_search_unwind_table(as, ip, &di, pi,
346 need_unwind_info, arg);
347 }
348#endif
297 349
298 memset(&di, 0, sizeof(di)); 350 return -EINVAL;
299 di.format = UNW_INFO_FORMAT_REMOTE_TABLE;
300 di.start_ip = map->start;
301 di.end_ip = map->end;
302 di.u.rti.segbase = map->start + segbase;
303 di.u.rti.table_data = map->start + table_data;
304 di.u.rti.table_len = fde_count * sizeof(struct table_entry)
305 / sizeof(unw_word_t);
306 return dwarf_search_unwind_table(as, ip, &di, pi,
307 need_unwind_info, arg);
308} 351}
309 352
310static int access_fpreg(unw_addr_space_t __maybe_unused as, 353static int access_fpreg(unw_addr_space_t __maybe_unused as,