From 1fabaddd153e9e2f7e5695266abb56c934629718 Mon Sep 17 00:00:00 2001 From: Alim Akhtar Date: Mon, 11 Jul 2016 09:34:18 +0200 Subject: arm64: dts: Fix RTC by providing rtc_src clock Add RTC source clock as Exynos7 needs source (32.768KHz) clock for RTC block. Without this currently S3C RTC driver probe is broken on this SoC. Signed-off-by: Alim Akhtar Reviewed-by: Javier Martinez Canillas Signed-off-by: Krzysztof Kozlowski Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/exynos/exynos7-espresso.dts | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/exynos/exynos7-espresso.dts b/arch/arm64/boot/dts/exynos/exynos7-espresso.dts index 299f3ce969ab..c528dd52ba2d 100644 --- a/arch/arm64/boot/dts/exynos/exynos7-espresso.dts +++ b/arch/arm64/boot/dts/exynos/exynos7-espresso.dts @@ -12,6 +12,7 @@ /dts-v1/; #include "exynos7.dtsi" #include +#include / { model = "Samsung Exynos7 Espresso board based on EXYNOS7"; @@ -43,6 +44,8 @@ &rtc { status = "okay"; + clocks = <&clock_ccore PCLK_RTC>, <&s2mps15_osc S2MPS11_CLK_AP>; + clock-names = "rtc", "rtc_src"; }; &watchdog { -- cgit v1.2.2 From d97ffe788184313fd83ff7da4b6587156f8388f0 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 2 Aug 2016 09:44:33 +0200 Subject: ARM: do away with final ARCH_REQUIRE_GPIOLIB A new user of the Kconfig selection of ARCH_REQUIRE_GPIOLIB has appeared. Replace with just selecting GPIOLIB. Signed-off-by: Linus Walleij Signed-off-by: Olof Johansson --- arch/arm/mach-clps711x/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mach-clps711x/Kconfig b/arch/arm/mach-clps711x/Kconfig index dc7c6edeab39..61284b9389cf 100644 --- a/arch/arm/mach-clps711x/Kconfig +++ b/arch/arm/mach-clps711x/Kconfig @@ -1,13 +1,13 @@ menuconfig ARCH_CLPS711X bool "Cirrus Logic EP721x/EP731x-based" depends on ARCH_MULTI_V4T - select ARCH_REQUIRE_GPIOLIB select AUTO_ZRELADDR select CLKSRC_OF select CLPS711X_TIMER select COMMON_CLK select CPU_ARM720T select GENERIC_CLOCKEVENTS + select GPIOLIB select MFD_SYSCON select OF_IRQ select USE_OF -- cgit v1.2.2 From 869ec056fa8450184423c8076e0a342db127795c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Jun 2016 14:25:19 +0200 Subject: ARM: shmobile: don't call platform_can_secondary_boot on UP For rcar-gen2, we build the SMP files even for UP configurations, and that just broke: arch/arm/mach-shmobile/built-in.o: In function `shmobile_smp_init_fallback_ops': pm-rcar-gen2.c:(.init.text+0x40c): undefined reference to `platform_can_secondary_boot' This adds an compile-time check before the call to platform_can_secondary_boot, turning the function into an empty stub for UP configurations. Signed-off-by: Arnd Bergmann Fixes: c21af444eace ("ARM: shmobile: smp: Add function to prioritize DT SMP") Acked-by: Geert Uytterhoeven Signed-off-by: Olof Johansson --- arch/arm/mach-shmobile/platsmp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm/mach-shmobile/platsmp.c b/arch/arm/mach-shmobile/platsmp.c index f3dba6f356e2..02e21bceb085 100644 --- a/arch/arm/mach-shmobile/platsmp.c +++ b/arch/arm/mach-shmobile/platsmp.c @@ -40,5 +40,8 @@ bool shmobile_smp_cpu_can_disable(unsigned int cpu) bool __init shmobile_smp_init_fallback_ops(void) { /* fallback on PSCI/smp_ops if no other DT based method is detected */ + if (!IS_ENABLED(CONFIG_SMP)) + return false; + return platform_can_secondary_boot() ? true : false; } -- cgit v1.2.2 From 7a376ac11fc2109dfd86442ff79982ecf16dcd6d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Aug 2016 11:07:44 +0200 Subject: Revert "ARM: aspeed: adapt defconfigs for new CONFIG_PRINTK_TIME" The patch that this was preparing for made it into neither v4.7 nor v4.8, so we should back this out as well to avoid the opposite warning: arch/arm/configs/aspeed_g5_defconfig:62:warning: symbol value '1' invalid for PRINTK_TIME arch/arm/configs/aspeed_g4_defconfig:61:warning: symbol value '1' invalid for PRINTK_TIME Sorry for not catching this earlier. Signed-off-by: Arnd Bergmann Fixes: 0ef659a30055 ("ARM: aspeed: adapt defconfigs for new CONFIG_PRINTK_TIME") Cc: stable@vger.kernel.org # v4.7 --- arch/arm/configs/aspeed_g4_defconfig | 2 +- arch/arm/configs/aspeed_g5_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index b6e54ee9bdbd..ca39c04fec6b 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -58,7 +58,7 @@ CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_IOMMU_SUPPORT is not set CONFIG_FIRMWARE_MEMMAP=y CONFIG_FANOTIFY=y -CONFIG_PRINTK_TIME=1 +CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_STRIP_ASM_SYMS=y CONFIG_PAGE_POISONING=y diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 892605167357..4f366b0370e9 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -59,7 +59,7 @@ CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_IOMMU_SUPPORT is not set CONFIG_FIRMWARE_MEMMAP=y CONFIG_FANOTIFY=y -CONFIG_PRINTK_TIME=1 +CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_STRIP_ASM_SYMS=y CONFIG_PAGE_POISONING=y -- cgit v1.2.2 From 0b98027122d0715aa2dc39882e4c295112275785 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 29 Jul 2016 14:26:53 +1000 Subject: m68knommu: fix user a5 register being overwritten On no-MMU systems the application a5 register can be overwitten with the address of the process data segment when processing application signals. For flat format applications compiled with full absolute relocation this effectively corrupts the a5 register on signal processing - and this very quickly leads to process crash and often takes out the whole system with a panic as well. This has no effect on flat format applications compiled with the more common PIC methods (such as -msep-data). These format applications reserve a5 for the pointer to the data segment anyway - so it doesn't change it. A long time ago the a5 register was used in the code packed into the user stack to enable signal return processing. And so it had to be restored on end of signal cleanup processing back to the original a5 user value. This was historically done by saving away a5 in the sigcontext structure. At some point (a long time back it seems) the a5 restore process was changed and it was hard coded to put the user data segment address directly into a5. Which is ok for the common PIC compiled application case, but breaks the full relocation application code. We no longer use this type of signal handling mechanism and so we don't need to do anything special to save and restore a5 at all now. So remove the code that hard codes a5 to the address of the user data segment. Signed-off-by: Greg Ungerer --- arch/m68k/kernel/signal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 2dcee3a88867..9202f82dfce6 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -213,7 +213,6 @@ static inline int frame_extra_sizes(int f) static inline void adjustformat(struct pt_regs *regs) { - ((struct switch_stack *)regs - 1)->a5 = current->mm->start_data; /* * set format byte to make stack appear modulo 4, which it will * be when doing the rte -- cgit v1.2.2 From 416f37d0816b9720b8227953e55954d81456f991 Mon Sep 17 00:00:00 2001 From: Darren Stevens Date: Wed, 27 Jul 2016 15:41:29 +0100 Subject: powerpc/pasemi: Fix coherent_dma_mask for dma engine Commit 817820b0226a ("powerpc/iommu: Support "hybrid" iommu/direct DMA ops for coherent_mask < dma_mask) adds a check of coherent_dma_mask for dma allocations. Unfortunately current PASemi code does not set this value for the DMA engine, which ends up with the default value of 0xffffffff, the result is on a PASemi system with >2Gb ram and iommu enabled the the onboard ethernet stops working due to an inability to allocate memory. Add an initialisation to pci_dma_dev_setup_pasemi(). Signed-off-by: Darren Stevens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pasemi/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index 309d9ccccd50..c61667e8bb06 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -187,6 +187,11 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev) if (dev->vendor == 0x1959 && dev->device == 0xa007 && !firmware_has_feature(FW_FEATURE_LPAR)) { dev->dev.archdata.dma_ops = &dma_direct_ops; + /* + * Set the coherent DMA mask to prevent the iommu + * being used unnecessarily + */ + dev->dev.coherent_dma_mask = DMA_BIT_MASK(44); return; } #endif -- cgit v1.2.2 From bf47dc572aaf53c3f2311c71e03ec01be3131fd6 Mon Sep 17 00:00:00 2001 From: Sascha Silbe Date: Mon, 4 Jul 2016 17:16:47 +0200 Subject: s390: clarify compressed image code path The way the decompressor is hooked into the start-up code is rather subtle, with a mix of multiply-defined symbols and hardcoded address literals. Add some comments at the junction points to clarify how it works. Signed-off-by: Sascha Silbe Signed-off-by: Martin Schwidefsky --- arch/s390/boot/compressed/head.S | 11 ++++++++--- arch/s390/kernel/head.S | 4 +++- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/boot/compressed/head.S b/arch/s390/boot/compressed/head.S index f86a4eef28a9..28c4f96a2d9c 100644 --- a/arch/s390/boot/compressed/head.S +++ b/arch/s390/boot/compressed/head.S @@ -21,16 +21,21 @@ ENTRY(startup_continue) lg %r15,.Lstack-.LPG1(%r13) aghi %r15,-160 brasl %r14,decompress_kernel - # setup registers for memory mover & branch to target + # Set up registers for memory mover. We move the decompressed image to + # 0x11000, starting at offset 0x11000 in the decompressed image so + # that code living at 0x11000 in the image will end up at 0x11000 in + # memory. lgr %r4,%r2 lg %r2,.Loffset-.LPG1(%r13) la %r4,0(%r2,%r4) lg %r3,.Lmvsize-.LPG1(%r13) lgr %r5,%r3 - # move the memory mover someplace safe + # Move the memory mover someplace safe so it doesn't overwrite itself. la %r1,0x200 mvc 0(mover_end-mover,%r1),mover-.LPG1(%r13) - # decompress image is started at 0x11000 + # When the memory mover is done we pass control to + # arch/s390/kernel/head64.S:startup_continue which lives at 0x11000 in + # the decompressed image. lgr %r6,%r2 br %r1 mover: diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index 56e4d8234ef2..4431905f8cfa 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -309,7 +309,9 @@ ENTRY(startup_kdump) l %r15,.Lstack-.LPG0(%r13) ahi %r15,-STACK_FRAME_OVERHEAD brasl %r14,verify_facilities - /* Continue with startup code in head64.S */ +# For uncompressed images, continue in +# arch/s390/kernel/head64.S. For compressed images, continue in +# arch/s390/boot/compressed/head.S. jg startup_continue .Lstack: -- cgit v1.2.2 From 134a24cd895c5d138d639a0741ad27e389cd4562 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 3 Aug 2016 17:55:00 +0200 Subject: s390/crc32-vx: Fix checksum calculation for small sizes The current prealign logic will fail for sizes < alignment, as the new datalen passed to the vector function is smaller than zero. Being a size_t this gets wrapped to a huge number causing memory overruns and wrong data. Let's add an early exit if the size is smaller than the minimal size with alignment. This will also avoid calling the software fallback twice for all sizes smaller than the minimum size (prealign + remaining) Signed-off-by: Christian Borntraeger Fixes: f848dbd3bc1a ("s390/crc32-vx: add crypto API module for optimized CRC-32 algorithms") Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/crypto/crc32-vx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c index 577ae1d4ae89..2bad9d837029 100644 --- a/arch/s390/crypto/crc32-vx.c +++ b/arch/s390/crypto/crc32-vx.c @@ -51,6 +51,9 @@ u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); struct kernel_fpu vxstate; \ unsigned long prealign, aligned, remaining; \ \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \ + return ___crc32_sw(crc, data, datalen); \ + \ if ((unsigned long)data & VX_ALIGN_MASK) { \ prealign = VX_ALIGNMENT - \ ((unsigned long)data & VX_ALIGN_MASK); \ @@ -59,9 +62,6 @@ u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); data = (void *)((unsigned long)data + prealign); \ } \ \ - if (datalen < VX_MIN_LEN) \ - return ___crc32_sw(crc, data, datalen); \ - \ aligned = datalen & ~VX_ALIGN_MASK; \ remaining = datalen & VX_ALIGN_MASK; \ \ -- cgit v1.2.2 From e2efc424549eb66d227dfe6d073d1789a24bc698 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Aug 2016 10:24:30 +0200 Subject: s390/lib: fix memcmp and strstr if two string compare equal the clcle instruction will update the string addresses to point _after_ the string. This might already be on a different page, so we should not use these pointer to calculate the difference as in that case the calculation of the difference can cause oopses. The return value of memcmp does not need the difference, we can just reuse the condition code and return for CC=1 (All bytes compared, first operand low) -1 and for CC=2 (All bytes compared, first operand high) +1 strstr also does not need the diff. While fixing this, make the common function clcle "correct on its own" by using l1 instead of l2 for the first length. strstr will call this with l2 for both strings. Signed-off-by: Christian Borntraeger Fixes: db7f5eef3dc0 ("s390/lib: use basic blocks for inline assemblies") Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/lib/string.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index e390bbb16443..48352bffbc92 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -237,11 +237,10 @@ char * strrchr(const char * s, int c) EXPORT_SYMBOL(strrchr); static inline int clcle(const char *s1, unsigned long l1, - const char *s2, unsigned long l2, - int *diff) + const char *s2, unsigned long l2) { register unsigned long r2 asm("2") = (unsigned long) s1; - register unsigned long r3 asm("3") = (unsigned long) l2; + register unsigned long r3 asm("3") = (unsigned long) l1; register unsigned long r4 asm("4") = (unsigned long) s2; register unsigned long r5 asm("5") = (unsigned long) l2; int cc; @@ -252,7 +251,6 @@ static inline int clcle(const char *s1, unsigned long l1, " srl %0,28" : "=&d" (cc), "+a" (r2), "+a" (r3), "+a" (r4), "+a" (r5) : : "cc"); - *diff = *(char *)r2 - *(char *)r4; return cc; } @@ -270,9 +268,9 @@ char * strstr(const char * s1,const char * s2) return (char *) s1; l1 = __strend(s1) - s1; while (l1-- >= l2) { - int cc, dummy; + int cc; - cc = clcle(s1, l1, s2, l2, &dummy); + cc = clcle(s1, l2, s2, l2); if (!cc) return (char *) s1; s1++; @@ -313,11 +311,11 @@ EXPORT_SYMBOL(memchr); */ int memcmp(const void *cs, const void *ct, size_t n) { - int ret, diff; + int ret; - ret = clcle(cs, n, ct, n, &diff); + ret = clcle(cs, n, ct, n); if (ret) - ret = diff; + ret = ret == 1 ? -1 : 1; return ret; } EXPORT_SYMBOL(memcmp); -- cgit v1.2.2 From 6e127efeeae5bbbc323e028d96c43dec0cc7b1b8 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 4 Aug 2016 12:51:09 +0200 Subject: s390/config: make the vector optimized crc function builtin For all configs with CONFIG_BTRFS_FS = y we should also make the optimized crc module builtin. Otherwise early mounts will fall back to the software variant. Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/configs/default_defconfig | 2 +- arch/s390/configs/gcov_defconfig | 2 +- arch/s390/configs/performance_defconfig | 2 +- arch/s390/defconfig | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 889ea3450210..26e0c7f08814 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -678,7 +678,7 @@ CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=m +CONFIG_CRYPTO_CRC32_S390=y CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 1bcfd764910a..24879dab47bc 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -616,7 +616,7 @@ CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=m +CONFIG_CRYPTO_CRC32_S390=y CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 13ff090139c8..a5c1e5f2a0ca 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -615,7 +615,7 @@ CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m -CONFIG_CRYPTO_CRC32_S390=m +CONFIG_CRYPTO_CRC32_S390=y CONFIG_ASYMMETRIC_KEY_TYPE=y CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m CONFIG_X509_CERTIFICATE_PARSER=m diff --git a/arch/s390/defconfig b/arch/s390/defconfig index ccccebeeaaf6..73610f2e3b4f 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -234,7 +234,7 @@ CONFIG_CRYPTO_SHA256_S390=m CONFIG_CRYPTO_SHA512_S390=m CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m -CONFIG_CRYPTO_CRC32_S390=m +CONFIG_CRYPTO_CRC32_S390=y CONFIG_CRC7=m # CONFIG_XZ_DEC_X86 is not set # CONFIG_XZ_DEC_POWERPC is not set -- cgit v1.2.2 From e4630fdd47637168927905983205d7b7c5c08c09 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Aug 2016 15:31:31 +0200 Subject: x86/power/64: Always create temporary identity mapping correctly The low-level resume-from-hibernation code on x86-64 uses kernel_ident_mapping_init() to create the temoprary identity mapping, but that function assumes that the offset between kernel virtual addresses and physical addresses is aligned on the PGD level. However, with a randomized identity mapping base, it may be aligned on the PUD level and if that happens, the temporary identity mapping created by set_up_temporary_mappings() will not reflect the actual kernel identity mapping and the image restoration will fail as a result (leading to a kernel panic most of the time). To fix this problem, rework kernel_ident_mapping_init() to support unaligned offsets between KVA and PA up to the PMD level and make set_up_temporary_mappings() use it as approprtiate. Reported-and-tested-by: Thomas Garnier Reported-by: Borislav Petkov Suggested-by: Yinghai Lu Signed-off-by: Rafael J. Wysocki Acked-by: Yinghai Lu --- arch/x86/include/asm/init.h | 4 ++-- arch/x86/mm/ident_map.c | 19 +++++++++++-------- arch/x86/power/hibernate_64.c | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 223042086f4e..737da62bfeb0 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,10 +5,10 @@ struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ unsigned long pmd_flag; /* page flag for PMD entry */ - bool kernel_mapping; /* kernel mapping or ident mapping */ + unsigned long offset; /* ident mapping offset */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end); + unsigned long pstart, unsigned long pend); #endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index ec21796ac5fd..4473cb4f8b90 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -3,15 +3,17 @@ * included by both the compressed kernel and the regular kernel. */ -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, +static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, unsigned long addr, unsigned long end) { addr &= PMD_MASK; for (; addr < end; addr += PMD_SIZE) { pmd_t *pmd = pmd_page + pmd_index(addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); + if (pmd_present(*pmd)) + continue; + + set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag)); } } @@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (pud_present(*pud)) { pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); continue; } pmd = (pmd_t *)info->alloc_pgt_page(info->context); if (!pmd) return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } @@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, } int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) + unsigned long pstart, unsigned long pend) { + unsigned long addr = pstart + info->offset; + unsigned long end = pend + info->offset; unsigned long next; int result; - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pgd_t *pgd = pgd_page + pgd_index(addr); pud_t *pud; next = (addr & PGDIR_MASK) + PGDIR_SIZE; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f0b5f2d402af..a3e3ccc87138 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -87,7 +87,7 @@ static int set_up_temporary_mappings(void) struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, - .kernel_mapping = true, + .offset = __PAGE_OFFSET, }; unsigned long mstart, mend; pgd_t *pgd; -- cgit v1.2.2 From d2cf5be07ff7c7cde8bef8551a30e8e86e2037a7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 4 Aug 2016 16:38:15 +1000 Subject: crypto: crc32c-vpmsum - Convert to CPU feature based module autoloading This patch utilises the GENERIC_CPU_AUTOPROBE infrastructure to automatically load the crc32c-vpmsum module if the CPU supports it. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/crypto/crc32c-vpmsum_glue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c index bfe3d37a24ef..9fa046d56eba 100644 --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #define CHKSUM_BLOCK_SIZE 1 @@ -157,7 +158,7 @@ static void __exit crc32c_vpmsum_mod_fini(void) crypto_unregister_shash(&alg); } -module_init(crc32c_vpmsum_mod_init); +module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crc32c_vpmsum_mod_init); module_exit(crc32c_vpmsum_mod_fini); MODULE_AUTHOR("Anton Blanchard "); -- cgit v1.2.2 From 880a3d6afd068682d6386a0528be1217541d3d8e Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 2 Aug 2016 12:39:43 +1000 Subject: powerpc/xics: Properly set Edge/Level type and enable resend This sets the type of the interrupt appropriately. We set it as follow: - If not mapped from the device-tree, we use edge. This is the case of the virtual interrupts and PCI MSIs for example. - If mapped from the device-tree and #interrupt-cells is 2 (PAPR compliant), we use the second cell to set the appropriate type - If mapped from the device-tree and #interrupt-cells is 1 (current OPAL on P8 does that), we assume level sensitive since those are typically going to be the PSI LSIs which are level sensitive. Additionally, we mark the interrupts requested via the opal_interrupts property all level. This is a bit fishy but the best we can do until we fix OPAL to properly expose them with a complete descriptor. It is also correct for the current HW anyway as OPAL interrupts are currently PCI error and PSI interrupts which are level. Finally now that edge interrupts are properly identified, we can enable CONFIG_HARDIRQS_SW_RESEND which will make the core re-send them if they occur while masked, which some drivers rely upon. This fixes issues with lost interrupts on some Mellanox adapters. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/xics.h | 2 + arch/powerpc/platforms/powernv/opal-irqchip.c | 3 +- arch/powerpc/sysdev/xics/Kconfig | 1 + arch/powerpc/sysdev/xics/ics-opal.c | 4 +- arch/powerpc/sysdev/xics/ics-rtas.c | 4 +- arch/powerpc/sysdev/xics/xics-common.c | 59 +++++++++++++++++++++++---- 6 files changed, 63 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index f5f729c11578..f0b238516e9b 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -159,6 +159,8 @@ extern void xics_teardown_cpu(void); extern void xics_kexec_teardown_cpu(int secondary); extern void xics_migrate_irqs_away(void); extern void icp_native_eoi(struct irq_data *d); +extern int xics_set_irq_type(struct irq_data *d, unsigned int flow_type); +extern int xics_retrigger(struct irq_data *data); #ifdef CONFIG_SMP extern int xics_get_irq_server(unsigned int virq, const struct cpumask *cpumask, unsigned int strict_check); diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index e505223b4ec5..ed8bba68a162 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -228,7 +228,8 @@ int __init opal_event_init(void) } /* Install interrupt handler */ - rc = request_irq(virq, opal_interrupt, 0, "opal", NULL); + rc = request_irq(virq, opal_interrupt, IRQF_TRIGGER_LOW, + "opal", NULL); if (rc) { irq_dispose_mapping(virq); pr_warn("Error %d requesting irq %d (0x%x)\n", diff --git a/arch/powerpc/sysdev/xics/Kconfig b/arch/powerpc/sysdev/xics/Kconfig index 0031eda320c3..385e7aa9e273 100644 --- a/arch/powerpc/sysdev/xics/Kconfig +++ b/arch/powerpc/sysdev/xics/Kconfig @@ -1,6 +1,7 @@ config PPC_XICS def_bool n select PPC_SMP_MUXED_IPI + select HARDIRQS_SW_RESEND config PPC_ICP_NATIVE def_bool n diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index 27c936c080a6..1c6bf4b66f56 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -156,7 +156,9 @@ static struct irq_chip ics_opal_irq_chip = { .irq_mask = ics_opal_mask_irq, .irq_unmask = ics_opal_unmask_irq, .irq_eoi = NULL, /* Patched at init time */ - .irq_set_affinity = ics_opal_set_affinity + .irq_set_affinity = ics_opal_set_affinity, + .irq_set_type = xics_set_irq_type, + .irq_retrigger = xics_retrigger, }; static int ics_opal_map(struct ics *ics, unsigned int virq); diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index 3854dd41558d..78ee5c778ef8 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -163,7 +163,9 @@ static struct irq_chip ics_rtas_irq_chip = { .irq_mask = ics_rtas_mask_irq, .irq_unmask = ics_rtas_unmask_irq, .irq_eoi = NULL, /* Patched at init time */ - .irq_set_affinity = ics_rtas_set_affinity + .irq_set_affinity = ics_rtas_set_affinity, + .irq_set_type = xics_set_irq_type, + .irq_retrigger = xics_retrigger, }; static int ics_rtas_map(struct ics *ics, unsigned int virq) diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index a795a5f0301c..9d530f479588 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -328,8 +328,12 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw); - /* They aren't all level sensitive but we just don't really know */ - irq_set_status_flags(virq, IRQ_LEVEL); + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. The device-tree parsing will turn the LSIs + * back to level. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); /* Don't call into ICS for IPIs */ if (hw == XICS_IPI) { @@ -351,13 +355,54 @@ static int xics_host_xlate(struct irq_domain *h, struct device_node *ct, irq_hw_number_t *out_hwirq, unsigned int *out_flags) { - /* Current xics implementation translates everything - * to level. It is not technically right for MSIs but this - * is irrelevant at this point. We might get smarter in the future - */ *out_hwirq = intspec[0]; - *out_flags = IRQ_TYPE_LEVEL_LOW; + /* + * If intsize is at least 2, we look for the type in the second cell, + * we assume the LSB indicates a level interrupt. + */ + if (intsize > 1) { + if (intspec[1] & 1) + *out_flags = IRQ_TYPE_LEVEL_LOW; + else + *out_flags = IRQ_TYPE_EDGE_RISING; + } else + *out_flags = IRQ_TYPE_LEVEL_LOW; + + return 0; +} + +int xics_set_irq_type(struct irq_data *d, unsigned int flow_type) +{ + /* + * We only support these. This has really no effect other than setting + * the corresponding descriptor bits mind you but those will in turn + * affect the resend function when re-enabling an edge interrupt. + * + * Set set the default to edge as explained in map(). + */ + if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE) + flow_type = IRQ_TYPE_EDGE_RISING; + + if (flow_type != IRQ_TYPE_EDGE_RISING && + flow_type != IRQ_TYPE_LEVEL_LOW) + return -EINVAL; + + irqd_set_trigger_type(d, flow_type); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +int xics_retrigger(struct irq_data *data) +{ + /* + * We need to push a dummy CPPR when retriggering, since the subsequent + * EOI will try to pop it. Passing 0 works, as the function hard codes + * the priority value anyway. + */ + xics_push_cppr(0); + + /* Tell the core to do a soft retrigger */ return 0; } -- cgit v1.2.2 From 54a94fcf2f00c9deb9491d5ab2b5a4022332f47e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Aug 2016 08:37:03 +0300 Subject: powerpc/cell: Add missing error code in spufs_mkgang() We should return -ENOMEM if alloc_spu_gang() fails. Signed-off-by: Dan Carpenter Acked-by: Arnd Bergmann Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/spufs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 5be15cff758d..2975754c65ea 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -496,8 +496,10 @@ spufs_mkgang(struct inode *dir, struct dentry *dentry, umode_t mode) gang = alloc_spu_gang(); SPUFS_I(inode)->i_ctx = NULL; SPUFS_I(inode)->i_gang = gang; - if (!gang) + if (!gang) { + ret = -ENOMEM; goto out_iput; + } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; -- cgit v1.2.2 From 4d9021957b5218310e28767f25551ca8f5eac797 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 3 Aug 2016 18:40:45 +1000 Subject: powerpc/powernv/ioda: Fix TCE invalidate to work in real mode again Commit fd141d1a99a3 ("powerpc/powernv/pci: Rework accessing the TCE invalidate register") broke TCE invalidation on IODA2/PHB3 for real mode. This makes invalidate work again. Fixes: fd141d1a99a3 ("powerpc/powernv/pci: Rework accessing the TCE invalidate register") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6b9528307f62..dedbbe9785ba 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1877,7 +1877,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, unsigned shift, unsigned long index, unsigned long npages) { - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ -- cgit v1.2.2 From e325d76f8bd2d222a1f577aba00dfb43cece4cbc Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Fri, 5 Aug 2016 19:13:12 +0530 Subject: powerpc/powernv: Load correct TOC pointer while waking up from winkle. The function pnv_restore_hyp_resource() loads the TOC into r2 from the invalid PACA pointer before fixing r13 value. This do not affect POWER ISA 3.0 but it does have an impact on POWER ISA 2.07 or less leading CPU to get stuck forever. login: [ 471.830433] Processor 120 is stuck. This can be easily reproducible using following steps: - Turn off SMT $ ppc64_cpu --smt=off - offline/online any online cpu (Thread 0 of any core which is online) $ echo 0 > /sys/devices/system/cpu/cpu/online $ echo 1 > /sys/devices/system/cpu/cpu/online For POWER ISA 2.07 or less, the last bit of HSPRG0 is set indicating that thread is waking up from winkle. Hence, the last bit of HSPRG0(r13) needs to be clear before accessing it as PACA to avoid loading invalid values from invalid PACA pointer. Fix this by loading TOC after r13 register is corrected. Fixes: bcef83a00dc4 ("powerpc/powernv: Add platform support for stop instruction") Signed-off-by: Mahesh Salgaonkar Acked-by: Vaidyanathan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index ba79d15f4ddd..07a330ed7022 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -363,8 +363,8 @@ _GLOBAL(power9_idle_stop) * cr3 - set to gt if waking up with partial/complete hypervisor state loss */ _GLOBAL(pnv_restore_hyp_resource) - ld r2,PACATOC(r13); BEGIN_FTR_SECTION + ld r2,PACATOC(r13); /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state @@ -395,6 +395,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) */ clrldi r5,r13,63 clrrdi r13,r13,1 + + /* Now that we are sure r13 is corrected, load TOC */ + ld r2,PACATOC(r13); cmpwi cr4,r5,1 mtspr SPRN_HSPRG0,r13 -- cgit v1.2.2 From 98d8821a47f3fd7354d3ab87adb50b10c9adb937 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Fri, 5 Aug 2016 17:34:04 +0530 Subject: powerpc/powernv: Move IDLE_STATE_ENTER_SEQ macro to cpuidle.h Move IDLE_STATE_ENTER_SEQ macro to cpuidle.h so that MCE handler changes in subsequent patch can use it. No functionality change. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cpuidle.h | 13 +++++++++++++ arch/powerpc/kernel/idle_book3s.S | 12 ------------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 3d7fc06532a1..01b8a13f0224 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -19,4 +19,17 @@ extern u64 pnv_first_deep_stop_state; #endif +/* Idle state entry routines */ +#ifdef CONFIG_PPC_P7_NAP +#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ + /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ + std r0,0(r1); \ + ptesync; \ + ld r0,0(r1); \ +1: cmp cr0,r0,r0; \ + bne 1b; \ + IDLE_INST; \ + b . +#endif /* CONFIG_PPC_P7_NAP */ + #endif diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 07a330ed7022..2265c6398a17 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -44,18 +44,6 @@ PSSCR_PSLL_MASK | PSSCR_TR_MASK | \ PSSCR_MTL_MASK -/* Idle state entry routines */ - -#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ - /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ - std r0,0(r1); \ - ptesync; \ - ld r0,0(r1); \ -1: cmp cr0,r0,r0; \ - bne 1b; \ - IDLE_INST; \ - b . - .text /* -- cgit v1.2.2 From bc14c49195e49b3231c01e4c44e3e5456c940b94 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Fri, 5 Aug 2016 17:34:13 +0530 Subject: powerpc/powernv: Fix MCE handler to avoid trashing CR0/CR1 registers. The current implementation of MCE early handling modifies CR0/1 registers without saving its old values. Fix this by moving early check for powersaving mode to machine_check_handle_early(). The power architecture 2.06 or later allows the possibility of getting machine check while in nap/sleep/winkle. The last bit of HSPRG0 is set to 1, if thread is woken up from winkle. Hence, clear the last bit of HSPRG0 (r13) before MCE handler starts using it as paca pointer. Also, the current code always puts the thread into nap state irrespective of whatever idle state it woke up from. Fix that by looking at paca->thread_idle_state and put the thread back into same state where it came from. Fixes: 1c51089f777b ("powerpc/book3s: Return from interrupt if coming from evil context.") Reported-by: Paul Mackerras Signed-off-by: Mahesh Salgaonkar Reviewed-by: Shreyas B. Prabhu Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 69 +++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 41091fdf9bd8..df6d45eb4115 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -144,29 +144,14 @@ machine_check_pSeries_1: * vector */ SET_SCRATCH0(r13) /* save r13 */ -#ifdef CONFIG_PPC_P7_NAP -BEGIN_FTR_SECTION - /* Running native on arch 2.06 or later, check if we are - * waking up from nap. We only handle no state loss and - * supervisor state loss. We do -not- handle hypervisor - * state loss at this time. + /* + * Running native on arch 2.06 or later, we may wakeup from winkle + * inside machine check. If yes, then last bit of HSPGR0 would be set + * to 1. Hence clear it unconditionally. */ - mfspr r13,SPRN_SRR1 - rlwinm. r13,r13,47-31,30,31 - OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) - beq 9f - - mfspr r13,SPRN_SRR1 - rlwinm. r13,r13,47-31,30,31 - /* waking up from powersave (nap) state */ - cmpwi cr1,r13,2 - /* Total loss of HV state is fatal. let's just stay stuck here */ - OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) - bgt cr1,. -9: - OPT_SET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) -#endif /* CONFIG_PPC_P7_NAP */ + GET_PACA(r13) + clrrdi r13,r13,1 + SET_PACA(r13) EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION b machine_check_powernv_early @@ -1273,25 +1258,51 @@ machine_check_handle_early: * Check if thread was in power saving mode. We come here when any * of the following is true: * a. thread wasn't in power saving mode - * b. thread was in power saving mode with no state loss or - * supervisor state loss + * b. thread was in power saving mode with no state loss, + * supervisor state loss or hypervisor state loss. * - * Go back to nap again if (b) is true. + * Go back to nap/sleep/winkle mode again if (b) is true. */ rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */ beq 4f /* No, it wasn;t */ /* Thread was in power saving mode. Go back to nap again. */ cmpwi r11,2 - bne 3f - /* Supervisor state loss */ + blt 3f + /* Supervisor/Hypervisor state loss */ li r0,1 stb r0,PACA_NAPSTATELOST(r13) 3: bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP GET_PACA(r13) ld r1,PACAR1(r13) - li r3,PNV_THREAD_NAP - b pnv_enter_arch207_idle_mode + /* + * Check what idle state this CPU was in and go back to same mode + * again. + */ + lbz r3,PACA_THREAD_IDLE_STATE(r13) + cmpwi r3,PNV_THREAD_NAP + bgt 10f + IDLE_STATE_ENTER_SEQ(PPC_NAP) + /* No return */ +10: + cmpwi r3,PNV_THREAD_SLEEP + bgt 2f + IDLE_STATE_ENTER_SEQ(PPC_SLEEP) + /* No return */ + +2: + /* + * Go back to winkle. Please note that this thread was woken up in + * machine check from winkle and have not restored the per-subcore + * state. Hence before going back to winkle, set last bit of HSPGR0 + * to 1. This will make sure that if this thread gets woken up + * again at reset vector 0x100 then it will get chance to restore + * the subcore state. + */ + ori r13,r13,1 + SET_PACA(r13) + IDLE_STATE_ENTER_SEQ(PPC_WINKLE) + /* No return */ 4: #endif /* -- cgit v1.2.2 From 9dc512819e4b723856773f5a018865c20441072f Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 1 Aug 2016 16:32:51 +1000 Subject: powerpc: Fix unused function warning 'lmb_to_memblock' This patch fixes the following warning: arch/powerpc/platforms/pseries/hotplug-memory.c:323:29: error: 'lmb_to_memblock' defined but not used [-Werror=unused-function] static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) ^~~~~~~~~~~~~~~ The only consumer of this function is 'dlpar_remove_lmb', which is enabled with CONFIG_MEMORY_HOTREMOVE, so move it into the same ifdef block. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-memory.c | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 43f7beb2902d..76ec104e88be 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -320,19 +320,6 @@ static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb) return dlpar_update_device_tree_lmb(lmb); } -static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) -{ - unsigned long section_nr; - struct mem_section *mem_sect; - struct memory_block *mem_block; - - section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); - mem_sect = __nr_to_section(section_nr); - - mem_block = find_memory_block(mem_sect); - return mem_block; -} - #ifdef CONFIG_MEMORY_HOTREMOVE static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) { @@ -420,6 +407,19 @@ static bool lmb_is_removable(struct of_drconf_cell *lmb) static int dlpar_add_lmb(struct of_drconf_cell *); +static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) +{ + unsigned long section_nr; + struct mem_section *mem_sect; + struct memory_block *mem_block; + + section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); + mem_sect = __nr_to_section(section_nr); + + mem_block = find_memory_block(mem_sect); + return mem_block; +} + static int dlpar_remove_lmb(struct of_drconf_cell *lmb) { struct memory_block *mem_block; -- cgit v1.2.2 From 546c4402c57497a6ffdf5373aff75bc89f0866bc Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 6 Aug 2016 15:54:12 -0700 Subject: powerpc/vdso: Add missing include file Some powerpc builds fail with the following buld error. In file included from ./arch/powerpc/include/asm/mmu_context.h:11:0, from arch/powerpc/kernel/vdso.c:28: arch/powerpc/include/asm/cputhreads.h: In function 'get_tensr': arch/powerpc/include/asm/cputhreads.h:101:2: error: implicit declaration of function 'cpu_has_feature' Fixes: b92a226e5284 ("powerpc: Move cpu_has_feature() to a separate file") Signed-off-by: Guenter Roeck Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vdso.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 6767605ea8da..4111d30badfa 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include -- cgit v1.2.2 From 10560b9afc8abf349843dff88c45dd43223e803e Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 22 Jul 2016 14:05:29 -0300 Subject: powerpc/eeh: Switch to conventional PCI address output in EEH log This is a very minor/trivial fix for the output of PCI address on EEH logs. The PCI address on "OF node" field currently is using ":" as a separator for the function, but the usual separator is ".". This patch changes the separator to dot, so the PCI address is printed as usual. Signed-off-by: Guilherme G. Piccoli Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c9bc78e9c610..7429556eb8df 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -168,10 +168,10 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) int n = 0, l = 0; char buffer[128]; - n += scnprintf(buf+n, len-n, "%04x:%02x:%02x:%01x\n", + n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", edev->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); - pr_warn("EEH: of node=%04x:%02x:%02x:%01x\n", + pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", edev->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); -- cgit v1.2.2 From 61e8a0d5a0270b91581f6c715036844b2ea98da1 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 5 Aug 2016 16:40:56 +1000 Subject: powerpc/pci: Fix endian bug in fixed PHB numbering The recent commit 63a72284b159 ("powerpc/pci: Assign fixed PHB number based on device-tree properties"), added code to read a 64-bit property from the device tree, and if not found read a 32-bit property (reg). There was a bug in the 32-bit case, on big endian machines, due to the use of the 64-bit value to read the 32-bit property. The cast of &prop means we end up writing to the high 32-bit of prop, leaving the low 32-bits containing whatever junk was on the stack. If that junk value was non-zero, and < MAX_PHBS, we would end up using it as the PHB id. This results in users seeing what appear to be random PHB ids. Fix it by reading into a u32 property and then assigning that to the u64 value, letting the CPU do the correct conversions for us. Fixes: 63a72284b159 ("powerpc/pci: Assign fixed PHB number based on device-tree properties") Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/pci-common.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index a5c0153ede37..7fdf324d5b51 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -78,6 +78,7 @@ EXPORT_SYMBOL(get_pci_dma_ops); static int get_phb_number(struct device_node *dn) { int ret, phb_id = -1; + u32 prop_32; u64 prop; /* @@ -86,8 +87,10 @@ static int get_phb_number(struct device_node *dn) * reading "ibm,opal-phbid", only present in OPAL environment. */ ret = of_property_read_u64(dn, "ibm,opal-phbid", &prop); - if (ret) - ret = of_property_read_u32_index(dn, "reg", 1, (u32 *)&prop); + if (ret) { + ret = of_property_read_u32_index(dn, "reg", 1, &prop_32); + prop = prop_32; + } if (!ret) phb_id = (int)(prop & (MAX_PHBS - 1)); -- cgit v1.2.2 From c74dd88e77d3ecbc9e55c78796d82c9aa21cabad Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 9 Aug 2016 10:39:13 +0530 Subject: powerpc/book3s: Fix MCE console messages for unrecoverable MCE. When machine check occurs with MSR(RI=0), it means MC interrupt is unrecoverable and kernel goes down to panic path. But the console message still shows it as recovered. This patch fixes the MCE console messages. Fixes: 36df96f8acaf ("powerpc/book3s: Decode and save machine check event.") Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce.c | 3 ++- arch/powerpc/platforms/powernv/opal.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index ef267fd9dd22..5e7ece0fda9f 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -92,7 +92,8 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->in_use = 1; mce->initiator = MCE_INITIATOR_CPU; - if (handled) + /* Mark it recovered if we have handled it and MSR(RI=1). */ + if (handled && (regs->msr & MSR_RI)) mce->disposition = MCE_DISPOSITION_RECOVERED; else mce->disposition = MCE_DISPOSITION_NOT_RECOVERED; diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 8b4fc68cebcb..6c9a65b52e63 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -399,6 +399,7 @@ static int opal_recover_mce(struct pt_regs *regs, if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ + pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { /* Platform corrected itself */ -- cgit v1.2.2 From 5958d19a143eb229e9ece20bd4c781ad41cb7d24 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 8 Jul 2016 15:55:43 +1000 Subject: powerpc/pnv/pci: Fix incorrect PE reservation attempt on some 64-bit BARs The generic allocation code may sometimes decide to assign a prefetchable 64-bit BAR to the M32 window. In fact it may also decide to allocate a 64-bit non-prefetchable BAR to the M64 one ! So using the resource flags as a test to decide which window was used for PE allocation is just wrong and leads to insane PE numbers. Instead, compare the addresses to figure it out. Signed-off-by: Benjamin Herrenschmidt [mpe: Rename the function as agreed by Ben & Gavin] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index dedbbe9785ba..fd9444f9fb0c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -111,10 +111,17 @@ static int __init iommu_setup(char *str) } early_param("iommu", iommu_setup); -static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) +static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) { - return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == - (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); + /* + * WARNING: We cannot rely on the resource flags. The Linux PCI + * allocation code sometimes decides to put a 64-bit prefetchable + * BAR in the 32-bit window, so we have to compare the addresses. + * + * For simplicity we only test resource start. + */ + return (r->start >= phb->ioda.m64_base && + r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); } static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) @@ -229,7 +236,7 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, sgsz = phb->ioda.m64_segsize; for (i = 0; i <= PCI_ROM_RESOURCE; i++) { r = &pdev->resource[i]; - if (!r->parent || !pnv_pci_is_mem_pref_64(r->flags)) + if (!r->parent || !pnv_pci_is_m64(phb, r)) continue; start = _ALIGN_DOWN(r->start - base, sgsz); @@ -2863,7 +2870,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) res = &pdev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) { + if (!pnv_pci_is_m64(phb, res)) { dev_warn(&pdev->dev, "Don't support SR-IOV with" " non M64 VF BAR%d: %pR. \n", i, res); @@ -2958,7 +2965,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, index++; } } else if ((res->flags & IORESOURCE_MEM) && - !pnv_pci_is_mem_pref_64(res->flags)) { + !pnv_pci_is_m64(phb, res)) { region.start = res->start - phb->hose->mem_offset[0] - phb->ioda.m32_pci_base; @@ -3083,9 +3090,12 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, bridge = bridge->bus->self; } - /* We fail back to M32 if M64 isn't supported */ - if (phb->ioda.m64_segsize && - pnv_pci_is_mem_pref_64(type)) + /* + * We fall back to M32 if M64 isn't supported. We enforce the M64 + * alignment for any 64-bit resource, PCIe doesn't care and + * bridges only do 64-bit prefetchable anyway. + */ + if (phb->ioda.m64_segsize && (type & IORESOURCE_MEM_64)) return phb->ioda.m64_segsize; if (type & IORESOURCE_MEM) return phb->ioda.m32_segsize; @@ -3125,7 +3135,7 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus, w = NULL; if (r->flags & type & IORESOURCE_IO) w = &hose->io_resource; - else if (pnv_pci_is_mem_pref_64(r->flags) && + else if (pnv_pci_is_m64(phb, r) && (type & IORESOURCE_PREFETCH) && phb->ioda.m64_segsize) w = &hose->mem_resources[1]; -- cgit v1.2.2 From 50ee91bdef41c15b671dcd9446ee007a1d2f5ab7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 9 Aug 2016 10:30:49 +0800 Subject: arm64: Support hard limit of cpu count by nr_cpus Enable the hard limit of cpu count by set boot options nr_cpus=x on arm64, and make a minor change about message when total number of cpu exceeds the limit. Reviewed-by: Suzuki K Poulose Reported-by: Shiyuan Hu Signed-off-by: Kefeng Wang Signed-off-by: Will Deacon --- arch/arm64/kernel/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 76a6d9263908..d93d43352504 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -661,9 +661,9 @@ void __init smp_init_cpus(void) acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, acpi_parse_gic_cpu_interface, 0); - if (cpu_count > NR_CPUS) - pr_warn("no. of cores (%d) greater than configured maximum of %d - clipping\n", - cpu_count, NR_CPUS); + if (cpu_count > nr_cpu_ids) + pr_warn("Number of cores (%d) exceeds configured maximum of %d - clipping\n", + cpu_count, nr_cpu_ids); if (!bootcpu_valid) { pr_err("missing boot CPU MPIDR, not enabling secondaries\n"); @@ -677,7 +677,7 @@ void __init smp_init_cpus(void) * with entries in cpu_logical_map while initializing the cpus. * If the cpu set-up fails, invalidate the cpu_logical_map entry. */ - for (i = 1; i < NR_CPUS; i++) { + for (i = 1; i < nr_cpu_ids; i++) { if (cpu_logical_map(i) != INVALID_HWID) { if (smp_cpu_setup(i)) cpu_logical_map(i) = INVALID_HWID; -- cgit v1.2.2 From 3b2fbb3f06efe5bd2dfdce2a1db703e23c1a78af Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 7 Jul 2016 17:07:57 +0200 Subject: virtio/s390: deprecate old transport There only ever have been two host implementations of the old s390-virtio (pre-ccw) transport: the experimental kuli userspace, and qemu. As qemu switched its default to ccw with 2.4 (with most users having used ccw well before that) and removed the old transport entirely in 2.6, s390-virtio probably hasn't been in active use for quite some time and is therefore likely to bitrot. Let's start the slow march towards removing the code by deprecating it. Note that this also deprecates the early virtio console code, which has been causing trouble in the guest without being wired up in any relevant hypervisor code. Acked-by: Christian Borntraeger Reviewed-by: Dong Jia Shi Reviewed-by: Sascha Silbe Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- arch/s390/Kconfig | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9e607bf2d640..5a2907ee8c93 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -871,4 +871,17 @@ config S390_GUEST Select this option if you want to run the kernel as a guest under the KVM hypervisor. +config S390_GUEST_OLD_TRANSPORT + def_bool y + prompt "Guest support for old s390 virtio transport (DEPRECATED)" + depends on S390_GUEST + help + Enable this option to add support for the old s390-virtio + transport (i.e. virtio devices NOT based on virtio-ccw). This + type of virtio devices is only available on the experimental + kuli userspace or with old (< 2.6) qemu. If you are running + with a modern version of qemu (which supports virtio-ccw since + 1.4 and uses it by default since version 2.4), you probably won't + need this. + endmenu -- cgit v1.2.2 From 97b1d23f7bcbee00a5b85bf2c022c612ea67429d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 5 Aug 2016 12:08:36 +0100 Subject: metag: Drop show_mem() from mem_init() The recent commit 599d0c954f91 ("mm, vmscan: move LRU lists to node"), changed memory management code so that show_mem() is no longer safe to call prior to setup_per_cpu_pageset(), as pgdat->per_cpu_nodestats will still be NULL. This causes an oops on metag due to the call to show_mem() from mem_init(): node_page_state_snapshot(...) + 0x48 pgdat_reclaimable(struct pglist_data * pgdat = 0x402517a0) show_free_areas(unsigned int filter = 0) + 0x2cc show_mem(unsigned int filter = 0) + 0x18 mem_init() mm_init() start_kernel() + 0x204 This wasn't a problem before with zone_reclaimable() as zone_pcp_init() was already setting zone->pageset to &boot_pageset, via setup_arch() and paging_init(), which happens before mm_init(): zone_pcp_init(...) free_area_init_core(...) + 0x138 free_area_init_node(int nid = 0, ...) + 0x1a0 free_area_init_nodes(...) + 0x440 paging_init(unsigned long mem_end = 0x4fe00000) + 0x378 setup_arch(char ** cmdline_p = 0x4024e038) + 0x2b8 start_kernel() + 0x54 No other arches appear to call show_mem() during boot, and it doesn't really add much value to the log, so lets just drop it from mem_init(). Signed-off-by: James Hogan Acked-by: Mel Gorman Cc: linux-metag@vger.kernel.org --- arch/metag/mm/init.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index 11fa51c89617..c0ec116b3993 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -390,7 +390,6 @@ void __init mem_init(void) free_all_bootmem(); mem_init_print_info(NULL); - show_mem(0); } void free_initmem(void) -- cgit v1.2.2 From 255c0397bc402e3fcc8833c214f49b2108f04d1a Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 9 Aug 2016 16:18:52 +0200 Subject: ARM: imx6: mark GPC node as not populated after irq init to probe pm domain driver Since IRQCHIP_DECLARE now flags the GPC node as already populated, the GPC power domain driver is never probed unless we clear the flag again. Fixes: 15cc2ed6dcf9 ("of/irq: Mark initialised interrupt controllers as populated") Suggested-by: Rob Herring Signed-off-by: Philipp Zabel Cc: Jon Hunter Signed-off-by: Rob Herring --- arch/arm/mach-imx/gpc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c index fd8720532471..0df062d8b2c9 100644 --- a/arch/arm/mach-imx/gpc.c +++ b/arch/arm/mach-imx/gpc.c @@ -271,6 +271,12 @@ static int __init imx_gpc_init(struct device_node *node, for (i = 0; i < IMR_NUM; i++) writel_relaxed(~0, gpc_base + GPC_IMR1 + i * 4); + /* + * Clear the OF_POPULATED flag set in of_irq_init so that + * later the GPC power domain driver will not be skipped. + */ + of_node_clear_flag(node, OF_POPULATED); + return 0; } IRQCHIP_DECLARE(imx_gpc, "fsl,imx6q-gpc", imx_gpc_init); -- cgit v1.2.2 From b9a019899f61acca18df5fb5e38a8fcdfea86fcd Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 28 Jul 2016 19:38:07 +0100 Subject: ARM: 8590/1: sanity_check_meminfo(): avoid overflow on vmalloc_limit To limit the amount of mapped low memory, we determine a physical address boundary based on the start of the vmalloc area using __pa(). Strictly speaking, the vmalloc area location is arbitrary and does not necessarily corresponds to a valid physical address. For example, if PAGE_OFFSET = 0x80000000 PHYS_OFFSET = 0x90000000 vmalloc_min = 0xf0000000 then __pa(vmalloc_min) overflows and returns a wrapped 0 when phys_addr_t is a 32-bit type. Then the code that follows determines that the entire physical memory is above that boundary and no low memory gets mapped at all: |[...] |Machine model: Freescale i.MX51 NA04 Board |Ignoring RAM at 0x90000000-0xb0000000 (!CONFIG_HIGHMEM) |Consider using a HIGHMEM enabled kernel. To avoid this problem let's make vmalloc_limit a 64-bit value all the time and determine that boundary explicitly without using __pa(). Reported-by: Emil Renner Berthing Signed-off-by: Nicolas Pitre Tested-by: Emil Renner Berthing Signed-off-by: Russell King --- arch/arm/mm/mmu.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 62f4d01941f7..12774c8e770c 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1155,10 +1155,19 @@ void __init sanity_check_meminfo(void) { phys_addr_t memblock_limit = 0; int highmem = 0; - phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1; + u64 vmalloc_limit; struct memblock_region *reg; bool should_use_highmem = false; + /* + * Let's use our own (unoptimized) equivalent of __pa() that is + * not affected by wrap-arounds when sizeof(phys_addr_t) == 4. + * The result is used as the upper bound on physical memory address + * and may itself be outside the valid range for which phys_addr_t + * and therefore __pa() is defined. + */ + vmalloc_limit = (u64)(uintptr_t)vmalloc_min - PAGE_OFFSET + PHYS_OFFSET; + for_each_memblock(memory, reg) { phys_addr_t block_start = reg->base; phys_addr_t block_end = reg->base + reg->size; @@ -1183,10 +1192,11 @@ void __init sanity_check_meminfo(void) if (reg->size > size_limit) { phys_addr_t overlap_size = reg->size - size_limit; - pr_notice("Truncating RAM at %pa-%pa to -%pa", - &block_start, &block_end, &vmalloc_limit); - memblock_remove(vmalloc_limit, overlap_size); + pr_notice("Truncating RAM at %pa-%pa", + &block_start, &block_end); block_end = vmalloc_limit; + pr_cont(" to -%pa", &block_end); + memblock_remove(vmalloc_limit, overlap_size); should_use_highmem = true; } } -- cgit v1.2.2 From 61444cde9170e256c238a02c9a4861930db04f5f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Jul 2016 19:48:44 +0100 Subject: ARM: 8591/1: mm: use fully constructed struct pages for EFI pgd allocations The late_alloc() PTE allocation function used by create_mapping_late() does not call pgtable_page_ctor() on PTE pages it allocates, leaving the per-page spinlock uninitialized. Since generic page table manipulation code may assume that translation table pages that are not owned by init_mm are covered by fully constructed struct pages, the following crash may occur with the new UEFI memory attributes table code. efi: memattr: Processing EFI Memory Attributes table: efi: memattr: 0x0000ffa16000-0x0000ffa82fff [Runtime Code |RUN| | |XP| | | | | | | | ] Unable to handle kernel NULL pointer dereference at virtual address 00000010 pgd = c0204000 [00000010] *pgd=00000000 Internal error: Oops: 5 [#1] SMP ARM Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.7.0-rc4-00063-g3882aa7b340b #361 Hardware name: Generic DT based system task: ed858000 ti: ed842000 task.ti: ed842000 PC is at __lock_acquire+0xa0/0x19a8 ... [] (__lock_acquire) from [] (lock_acquire+0x6c/0x88) [] (lock_acquire) from [] (_raw_spin_lock+0x2c/0x3c) [] (_raw_spin_lock) from [] (apply_to_page_range+0xe8/0x238) [] (apply_to_page_range) from [] (efi_set_mapping_permissions+0x54/0x5c) [] (efi_set_mapping_permissions) from [] (efi_memattr_apply_permissions+0x2b8/0x378) [] (efi_memattr_apply_permissions) from [] (arm_enable_runtime_services+0x1f0/0x22c) [] (arm_enable_runtime_services) from [] (do_one_initcall+0x44/0x174) [] (do_one_initcall) from [] (kernel_init_freeable+0x90/0x1e8) [] (kernel_init_freeable) from [] (kernel_init+0x8/0x114) [] (kernel_init) from [] (ret_from_fork+0x14/0x24) The crash is due to the fact that the UEFI page tables are not owned by init_mm, but are not covered by fully constructed struct pages. Given that the UEFI subsystem is currently the only user of create_mapping_late(), add an unconditional call to pgtable_page_ctor() to late_alloc(). Fixes: 9fc68b717c24 ("ARM/efi: Apply strict permissions for UEFI Runtime Services regions") Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/mm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 12774c8e770c..6344913f0804 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -728,7 +728,8 @@ static void *__init late_alloc(unsigned long sz) { void *ptr = (void *)__get_free_pages(PGALLOC_GFP, get_order(sz)); - BUG_ON(!ptr); + if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) + BUG(); return ptr; } -- cgit v1.2.2 From 87eed3c74d7c65556f744230a90bf9556dd29146 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Aug 2016 10:33:35 +0100 Subject: ARM: fix address limit restoration for undefined instructions During boot, sometimes the kernel will test to see if an instruction causes an undefined instruction exception. Unfortunately, the exit path for these exceptions did not restore the address limit, which causes the rootfs mount code to fail. Fix the missing address limit restoration. Tested-by: Guenter Roeck Signed-off-by: Russell King --- arch/arm/kernel/entry-armv.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index bc5f50799d75..9f157e7c51e7 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -295,6 +295,7 @@ __und_svc_fault: bl __und_fault __und_svc_finish: + get_thread_info tsk ldr r5, [sp, #S_PSR] @ Get SVC cpsr svc_exit r5 @ return from exception UNWIND(.fnend ) -- cgit v1.2.2 From 1bc8b816cb8058c31f61fe78442f10a43209e582 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Aug 2016 10:07:05 +0200 Subject: powerpc/32: Fix csum_partial_copy_generic() Commit 7aef4136566b0 ("powerpc32: rewrite csum_partial_copy_generic() based on copy_tofrom_user()") introduced a bug when destination address is odd and initial csum is not null In that (rare) case the initial csum value has to be rotated one byte as well as the resulting value is This patch also fixes related comments Fixes: 7aef4136566b0 ("powerpc32: rewrite csum_partial_copy_generic() based on copy_tofrom_user()") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/lib/checksum_32.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S index d90870a66b60..0a57fe6d49cc 100644 --- a/arch/powerpc/lib/checksum_32.S +++ b/arch/powerpc/lib/checksum_32.S @@ -127,8 +127,9 @@ _GLOBAL(csum_partial_copy_generic) stw r7,12(r1) stw r8,8(r1) - andi. r0,r4,1 /* is destination address even ? */ - cmplwi cr7,r0,0 + rlwinm r0,r4,3,0x8 + rlwnm r6,r6,r0,0,31 /* odd destination address: rotate one byte */ + cmplwi cr7,r0,0 /* is destination address even ? */ addic r12,r6,0 addi r6,r4,-4 neg r0,r4 @@ -237,7 +238,7 @@ _GLOBAL(csum_partial_copy_generic) 66: addze r3,r12 addi r1,r1,16 beqlr+ cr7 - rlwinm r3,r3,8,0,31 /* swap bytes for odd destination */ + rlwinm r3,r3,8,0,31 /* odd destination address: rotate one byte */ blr /* read fault */ -- cgit v1.2.2 From c7a318ba868c61fc9be710a4970172d8c2eeb8b9 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Wed, 10 Aug 2016 15:44:46 +1000 Subject: powerpc/ptrace: Fix coredump since ptrace TM changes Commit 8d460f6156cd ("powerpc/process: Add the function flush_tmregs_to_thread") added flush_tmregs_to_thread() and included the assumption that it would only be called for a task which is not current. Although this is correct for ptrace, when generating a core dump, some of the routines which call flush_tmregs_to_thread() are called. This leads to a WARNing such as: Not expecting ptrace on self: TM regs may be incorrect ------------[ cut here ]------------ WARNING: CPU: 123 PID: 7727 at arch/powerpc/kernel/process.c:1088 flush_tmregs_to_thread+0x78/0x80 CPU: 123 PID: 7727 Comm: libvirtd Not tainted 4.8.0-rc1-gcc6x-g61e8a0d #1 task: c000000fe631b600 task.stack: c000000fe63b0000 NIP: c00000000001a1a8 LR: c00000000001a1a4 CTR: c000000000717780 REGS: c000000fe63b3420 TRAP: 0700 Not tainted (4.8.0-rc1-gcc6x-g61e8a0d) MSR: 900000010282b033 CR: 28004222 XER: 20000000 ... NIP [c00000000001a1a8] flush_tmregs_to_thread+0x78/0x80 LR [c00000000001a1a4] flush_tmregs_to_thread+0x74/0x80 Call Trace: flush_tmregs_to_thread+0x74/0x80 (unreliable) vsr_get+0x64/0x1a0 elf_core_dump+0x604/0x1430 do_coredump+0x5fc/0x1200 get_signal+0x398/0x740 do_signal+0x54/0x2b0 do_notify_resume+0x98/0xb0 ret_from_except_lite+0x70/0x74 So fix flush_tmregs_to_thread() to detect the case where it is called on current, and a transaction is active, and in that case flush the TM regs to the thread_struct. This patch also moves flush_tmregs_to_thread() into ptrace.c as it is only called from that file. Fixes: 8d460f6156cd ("powerpc/process: Add the function flush_tmregs_to_thread") Signed-off-by: Cyril Bur [mpe: Flesh out change log] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/switch_to.h | 8 -------- arch/powerpc/kernel/process.c | 20 -------------------- arch/powerpc/kernel/ptrace.c | 19 +++++++++++++++++++ 3 files changed, 19 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 0a74ebe934e1..17c8380673a6 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -75,14 +75,6 @@ static inline void disable_kernel_spe(void) static inline void __giveup_spe(struct task_struct *t) { } #endif -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -extern void flush_tmregs_to_thread(struct task_struct *); -#else -static inline void flush_tmregs_to_thread(struct task_struct *t) -{ -} -#endif - static inline void clear_task_ebb(struct task_struct *t) { #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 58ccf86415b4..9ee2623e0f67 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1074,26 +1074,6 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -void flush_tmregs_to_thread(struct task_struct *tsk) -{ - /* - * Process self tracing is not yet supported through - * ptrace interface. Ptrace generic code should have - * prevented this from happening in the first place. - * Warn once here with the message, if some how it - * is attempted. - */ - WARN_ONCE(tsk == current, - "Not expecting ptrace on self: TM regs may be incorrect\n"); - - /* - * If task is not current, it should have been flushed - * already to it's thread_struct during __switch_to(). - */ -} -#endif - struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 4f3c5756cc09..bf91658a8a40 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -38,6 +38,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -118,6 +119,24 @@ static const struct pt_regs_offset regoffset_table[] = { REG_OFFSET_END, }; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static void flush_tmregs_to_thread(struct task_struct *tsk) +{ + /* + * If task is not current, it will have been flushed already to + * it's thread_struct during __switch_to(). + * + * A reclaim flushes ALL the state. + */ + + if (tsk == current && MSR_TM_SUSPENDED(mfmsr())) + tm_reclaim_current(TM_CAUSE_SIGNAL); + +} +#else +static inline void flush_tmregs_to_thread(struct task_struct *tsk) { } +#endif + /** * regs_query_register_offset() - query register offset from its name * @name: the name of a register -- cgit v1.2.2 From 4d81aaa53c2dea220ddf88e19c33033d6cf4f8cb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Aug 2016 12:26:28 +0200 Subject: s390/pageattr: handle numpages parameter correctly Both set_memory_ro() and set_memory_rw() will modify the page attributes of at least one page, even if the numpages parameter is zero. The author expected that calling these functions with numpages == zero would never happen. However with the new 444d13ff10fb ("modules: add ro_after_init support") feature this happens frequently. Therefore do the right thing and make these two functions return gracefully if nothing should be done. Fixes crashes on module load like this one: Unable to handle kernel pointer dereference in virtual kernel address space Failing address: 000003ff80008000 TEID: 000003ff80008407 Fault in home space mode while using kernel ASCE. AS:0000000000d18007 R3:00000001e6aa4007 S:00000001e6a10800 P:00000001e34ee21d Oops: 0004 ilc:3 [#1] SMP Modules linked in: x_tables CPU: 10 PID: 1 Comm: systemd Not tainted 4.7.0-11895-g3fa9045 #4 Hardware name: IBM 2964 N96 703 (LPAR) task: 00000001e9118000 task.stack: 00000001e9120000 Krnl PSW : 0704e00180000000 00000000005677f8 (rb_erase+0xf0/0x4d0) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3 Krnl GPRS: 000003ff80008b20 000003ff80008b20 000003ff80008b70 0000000000b9d608 000003ff80008b20 0000000000000000 00000001e9123e88 000003ff80008950 00000001e485ab40 000003ff00000000 000003ff80008b00 00000001e4858480 0000000100000000 000003ff80008b68 00000000001d5998 00000001e9123c28 Krnl Code: 00000000005677e8: ec1801c3007c cgij %r1,0,8,567b6e 00000000005677ee: e32010100020 cg %r2,16(%r1) #00000000005677f4: a78401c2 brc 8,567b78 >00000000005677f8: e35010080024 stg %r5,8(%r1) 00000000005677fe: ec5801af007c cgij %r5,0,8,567b5c 0000000000567804: e30050000024 stg %r0,0(%r5) 000000000056780a: ebacf0680004 lmg %r10,%r12,104(%r15) 0000000000567810: 07fe bcr 15,%r14 Call Trace: ([<000003ff80008900>] __this_module+0x0/0xffffffffffffd700 [x_tables]) ([<0000000000264fd4>] do_init_module+0x12c/0x220) ([<00000000001da14a>] load_module+0x24e2/0x2b10) ([<00000000001da976>] SyS_finit_module+0xbe/0xd8) ([<0000000000803b26>] system_call+0xd6/0x264) Last Breaking-Event-Address: [<000000000056771a>] rb_erase+0x12/0x4d0 Kernel panic - not syncing: Fatal exception: panic_on_oops Reported-by: Christian Borntraeger Reported-and-tested-by: Sebastian Ott Fixes: e8a97e42dc98 ("s390/pageattr: allow kernel page table splitting") Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/pageattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 7104ffb5a67f..af7cf28cf97e 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -252,6 +252,8 @@ static int change_page_attr(unsigned long addr, unsigned long end, int rc = -EINVAL; pgd_t *pgdp; + if (addr == end) + return 0; if (end >= MODULES_END) return -EINVAL; mutex_lock(&cpa_mutex); -- cgit v1.2.2 From 7d70c63c7132eb95e428e945242360233ab23d82 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 10 Aug 2016 17:29:29 +1000 Subject: powerpc: Print the kernel load address at the end of prom_init() This makes it easier to debug crashes that happen very early before the kernel takes over Open Firmware by allowing us to relate the OF reported crashing addresses to offsets within the kernel. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 6ee4b72cda42..4e74fc588a3f 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2940,7 +2940,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, /* Don't print anything after quiesce under OPAL, it crashes OFW */ if (of_platform != PLATFORM_OPAL) { - prom_printf("Booting Linux via __start() ...\n"); + prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); prom_debug("->dt_header_start=0x%x\n", hdr); } -- cgit v1.2.2 From f9cc1d1f808dbdfd56978259d262b879504efaed Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 10 Aug 2016 17:32:38 +1000 Subject: powerpc: Update obsolete comment in setup_32.c about early_init() We don't identify the machine type anymore... Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_32.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index c3e861df4b20..f10a975baa5e 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -93,10 +93,8 @@ notrace unsigned long __init early_init(unsigned long dt_ptr) * and we are running with enough of the MMU enabled to have our * proper kernel virtual addresses * - * Find out what kind of machine we're on and save any data we need - * from the early boot process (devtree is copied on pmac by prom_init()). - * This is called very early on the boot process, after a minimal - * MMU environment has been set up but before MMU_init is called. + * We do the initial parsing of the flat device-tree and prepares + * for the MMU to be fully initialized. */ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ -- cgit v1.2.2 From 97f6e0cc35026a2a09147a6da636d901525e1969 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 10 Aug 2016 17:27:34 +1000 Subject: powerpc/32: Fix crash during static key init We cannot do those initializations from apply_feature_fixups() as this function runs in a very restricted environment on 32-bit where the kernel isn't running at its linked address and the PTRRELOC() macro must be used for any global accesss. Instead, split them into a separtate steup_feature_keys() function which is called in a more suitable spot on ppc32. Fixes: 309b315b6ec6 ("powerpc: Call jump_label_init() in apply_feature_fixups()") Reported-and-tested-by: Christian Kujau Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/feature-fixups.h | 1 + arch/powerpc/kernel/setup_32.c | 3 +++ arch/powerpc/kernel/setup_64.c | 1 + arch/powerpc/lib/feature-fixups.c | 3 +++ 4 files changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 57fec8ac7b92..ddf54f5bbdd1 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -186,6 +186,7 @@ label##3: \ #ifndef __ASSEMBLY__ void apply_feature_fixups(void); +void setup_feature_keys(void); #endif #endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */ diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index f10a975baa5e..24ec3ea4b3a2 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -100,6 +100,9 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { + /* Configure static keys first, now that we're relocated. */ + setup_feature_keys(); + /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index eafb9a79e011..7ac8e6eaab5b 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -300,6 +300,7 @@ void __init early_setup(unsigned long dt_ptr) /* Apply all the dynamic patching */ apply_feature_fixups(); + setup_feature_keys(); /* Initialize the hash table or TLB handling */ early_init_mmu(); diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 74145f02ad41..043415f0bdb1 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -188,7 +188,10 @@ void __init apply_feature_fixups(void) &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup); #endif do_final_fixups(); +} +void __init setup_feature_keys(void) +{ /* * Initialise jump label. This causes all the cpu/mmu_has_feature() * checks to take on their correct polarity based on the current set of -- cgit v1.2.2 From 1a9e4c564ab174e53ed86def922804a5ddc63e7d Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 14 Jul 2016 17:22:54 +0200 Subject: x86/timers/apic: Fix imprecise timer interrupts by eliminating TSC clockevents frequency roundoff error I noticed the following bug/misbehavior on certain Intel systems: with a single task running on a NOHZ CPU on an Intel Haswell, I recognized that I did not only get the one expected local_timer APIC interrupt, but two per second at minimum. (!) Further tracing showed that the first one precedes the programmed deadline by up to ~50us and hence, it did nothing except for reprogramming the TSC deadline clockevent device to trigger shortly thereafter again. The reason for this is imprecise calibration, the timeout we program into the APIC results in 'too short' timer interrupts. The core (hr)timer code notices this (because it has a precise ktime source and sees the short interrupt) and fixes it up by programming an additional very short interrupt period. This is obviously suboptimal. The reason for the imprecise calibration is twofold, and this patch fixes the first reason: In setup_APIC_timer(), the registered clockevent device's frequency is calculated by first dividing tsc_khz by TSC_DIVISOR and multiplying it with 1000 afterwards: (tsc_khz / TSC_DIVISOR) * 1000 The multiplication with 1000 is done for converting from kHz to Hz and the division by TSC_DIVISOR is carried out in order to make sure that the final result fits into an u32. However, with the order given in this calculation, the roundoff error introduced by the division gets magnified by a factor of 1000 by the following multiplication. To fix it, reversing the order of the division and the multiplication a la: (tsc_khz * 1000) / TSC_DIVISOR ... reduces the roundoff error already. Furthermore, if TSC_DIVISOR divides 1000, associativity holds: (tsc_khz * 1000) / TSC_DIVISOR = tsc_khz * (1000 / TSC_DIVISOR) and thus, the roundoff error even vanishes and the whole operation can be carried out within 32 bits. The powers of two that divide 1000 are 2, 4 and 8. A value of 8 for TSC_DIVISOR still allows for TSC frequencies up to 2^32 / 10^9ns * 8 = 34.4GHz which is way larger than anything to expect in the next years. Thus we also replace the current TSC_DIVISOR value of 32 by 8. Reverse the order of the divison and the multiplication in the calculation of the registered clockevent device's frequency. Signed-off-by: Nicolai Stange Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Acked-by: Thomas Gleixner Cc: Adrian Hunter Cc: Borislav Petkov Cc: Christopher S. Hall Cc: H. Peter Anvin Cc: Hidehiro Kawai Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20160714152255.18295-2-nicstange@gmail.com [ Improved changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ac8d8ad8b009..a315dc404756 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -313,7 +313,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 -#define TSC_DIVISOR 32 +#define TSC_DIVISOR 8 /* * This function sets up the local APIC timer, with a timeout of @@ -565,7 +565,7 @@ static void setup_APIC_timer(void) CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; clockevents_config_and_register(levt, - (tsc_khz / TSC_DIVISOR) * 1000, + tsc_khz * (1000 / TSC_DIVISOR), 0xF, ~0UL); } else clockevents_register_device(levt); -- cgit v1.2.2 From 6731b0d611a1274f9e785fa0189ac2aeeabd0591 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 14 Jul 2016 17:22:55 +0200 Subject: x86/timers/apic: Inform TSC deadline clockevent device about recalibration This patch eliminates a source of imprecise APIC timer interrupts, which imprecision may result in double interrupts or even late interrupts. The TSC deadline clockevent devices' configuration and registration happens before the TSC frequency calibration is refined in tsc_refine_calibration_work(). This results in the TSC clocksource and the TSC deadline clockevent devices being configured with slightly different frequencies: the former gets the refined one and the latter are configured with the inaccurate frequency detected earlier by means of the "Fast TSC calibration using PIT". Within the APIC code, introduce the notifier function lapic_update_tsc_freq() which reconfigures all per-CPU TSC deadline clockevent devices with the current tsc_khz. Call it from the TSC code after TSC calibration refinement has happened. Signed-off-by: Nicolai Stange Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Adrian Hunter Cc: Borislav Petkov Cc: Christopher S. Hall Cc: H. Peter Anvin Cc: Hidehiro Kawai Cc: Len Brown Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20160714152255.18295-3-nicstange@gmail.com [ Pushed #ifdef CONFIG_X86_LOCAL_APIC into header, improved changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/kernel/apic/apic.c | 24 ++++++++++++++++++++++++ arch/x86/kernel/tsc.c | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f5befd4945f2..124357773ffa 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -135,6 +135,7 @@ extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); +extern void lapic_update_tsc_freq(void); extern int APIC_init_uniprocessor(void); #ifdef CONFIG_X86_64 @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop +static inline void lapic_update_tsc_freq(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a315dc404756..0fd3d659f13c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -571,6 +571,30 @@ static void setup_APIC_timer(void) clockevents_register_device(levt); } +/* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + /* * In this functions we calibrate APIC bus clocks to the external timer. * diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a804b5ab32d0..8fb4b6abac0e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -22,6 +22,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -1249,6 +1250,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); + /* Inform the TSC deadline clockevent devices about the recalibration */ + lapic_update_tsc_freq(); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; -- cgit v1.2.2 From 469f00231278da68062a809306df0bac95a27507 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 15 Jul 2016 11:42:43 +0200 Subject: x86, kasan, ftrace: Put APIC interrupt handlers into .irqentry.text Dmitry Vyukov has reported unexpected KASAN stackdepot growth: https://github.com/google/kasan/issues/36 ... which is caused by the APIC handlers not being present in .irqentry.text: When building with CONFIG_FUNCTION_GRAPH_TRACER=y or CONFIG_KASAN=y, put the APIC interrupt handlers into the .irqentry.text section. This is needed because both KASAN and function graph tracer use __irqentry_text_start and __irqentry_text_end to determine whether a function is an IRQ entry point. Reported-by: Dmitry Vyukov Signed-off-by: Alexander Potapenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: aryabinin@virtuozzo.com Cc: kasan-dev@googlegroups.com Cc: kcc@google.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1468575763-144889-1-git-send-email-glider@google.com [ Minor edits. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b846875aeea6..9f85827db24e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -601,9 +601,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) .endm #endif +/* Make sure APIC interrupt handlers end up in the irqentry section: */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +# define POP_SECTION_IRQENTRY .popsection +#else +# define PUSH_SECTION_IRQENTRY +# define POP_SECTION_IRQENTRY +#endif + .macro apicinterrupt num sym do_sym +PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym trace_apicinterrupt \num \sym +POP_SECTION_IRQENTRY .endm #ifdef CONFIG_SMP -- cgit v1.2.2 From 22cc1ca3c5469cf17e149be232817b9223afa5e4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Aug 2016 21:54:53 +0200 Subject: x86/hpet: Fix /dev/rtc breakage caused by RTC cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ville Syrjälä reports "The first time I run hwclock after rebooting I get this: open("/dev/rtc", O_RDONLY) = 3 ioctl(3, PHN_SET_REGS or RTC_UIE_ON, 0) = 0 select(4, [3], NULL, NULL, {10, 0}) = 0 (Timeout) ioctl(3, PHN_NOT_OH or RTC_UIE_OFF, 0) = 0 close(3) = 0 On all subsequent runs I get this: open("/dev/rtc", O_RDONLY) = 3 ioctl(3, PHN_SET_REGS or RTC_UIE_ON, 0) = -1 EINVAL (Invalid argument) ioctl(3, RTC_RD_TIME, 0x7ffd76b3ae70) = -1 EINVAL (Invalid argument) close(3) = 0" This was caused by a stupid typo in a patch that should have been a simple rename to move around contents of a header file, but accidentally wrote zeroes into the rtc rather than reading from it: 463a86304cae ("char/genrtc: x86: remove remnants of asm/rtc.h") Reported-by: Ville Syrjälä Tested-by: Jarkko Nikula Tested-by: Ville Syrjälä Signed-off-by: Arnd Bergmann Cc: Alessandro Zummo Cc: Alexandre Belloni Cc: Borislav Petkov Cc: Geert Uytterhoeven Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: rtc-linux@googlegroups.com Fixes: 463a86304cae ("char/genrtc: x86: remove remnants of asm/rtc.h") Link: http://lkml.kernel.org/r/20160809195528.1604312-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ed16e58658a4..c6dfd801df97 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1242,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - mc146818_set_time(&curr_time); + mc146818_get_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { -- cgit v1.2.2 From c7d2361f7524f365c1ae42f47880e3fa9efb2c2a Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 9 Aug 2016 10:11:04 -0700 Subject: x86/mm/KASLR: Fix physical memory calculation on KASLR memory randomization Initialize KASLR memory randomization after max_pfn is initialized. Also ensure the size is rounded up. It could create problems on machines with more than 1Tb of memory on certain random addresses. Signed-off-by: Thomas Garnier Cc: Aleksey Makarov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Fabian Frederick Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions") Link: http://lkml.kernel.org/r/1470762665-88032-1-git-send-email-thgarnie@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++++-- arch/x86/mm/kaslr.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 991b77986d57..95cf31c9f4ec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); - kernel_randomize_memory(); - iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); @@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 26dccd6c0df1..ec8654f117d8 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -97,7 +97,7 @@ void __init kernel_randomize_memory(void) * add padding if needed (especially for memory hotplug support). */ BUG_ON(kaslr_regions[0].base != &page_offset_base); - memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + + memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* Adapt phyiscal memory region size based on available memory */ -- cgit v1.2.2 From fb754f958f8e46202c1efd7f66d5b3db1208117d Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 9 Aug 2016 10:11:05 -0700 Subject: x86/mm/KASLR: Increase BRK pages for KASLR memory randomization Default implementation expects 6 pages maximum are needed for low page allocations. If KASLR memory randomization is enabled, the worse case of e820 layout would require 12 pages (no large pages). It is due to the PUD level randomization and the variable e820 memory layout. This bug was found while doing extensive testing of KASLR memory randomization on different type of hardware. Signed-off-by: Thomas Garnier Cc: Aleksey Makarov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Fabian Frederick Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions") Link: http://lkml.kernel.org/r/1470762665-88032-2-git-send-email-thgarnie@google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 620928903be3..d28a2d741f9e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num) return __va(pfn << PAGE_SHIFT); } -/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ -#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) +/* + * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. + * With KASLR memory randomization, depending on the machine e820 memory + * and the PUD alignment. We may need twice more pages when KASLR memory + * randomization is enabled. + */ +#ifndef CONFIG_RANDOMIZE_MEMORY +#define INIT_PGD_PAGE_COUNT 6 +#else +#define INIT_PGD_PAGE_COUNT 12 +#endif +#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { -- cgit v1.2.2 From 164af597ce945751e2dcd53d0a86e84203a6d117 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 9 Aug 2016 22:43:46 +1000 Subject: powerpc/Makefile: Use cflags-y/aflags-y for setting endian options When we introduced the little endian support, we added the endian flags to CC directly using override. I don't know the history of why we did that, I suspect no one does. Although this mostly works, it has one bug, which is that CROSS32CC doesn't get -mbig-endian. That means when the compiler is little endian by default and the user is building big endian, vdso32 is incorrectly compiled as little endian and the kernel fails to build. Instead we can add the endian flags to cflags-y/aflags-y, and then append those to KBUILD_CFLAGS/KBUILD_AFLAGS. This has the advantage of being 1) less ugly, 2) the documented way of adding flags in the arch Makefile and 3) it fixes building vdso32 with a LE toolchain. Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index ca254546cd05..1934707bf321 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -66,29 +66,28 @@ endif UTS_MACHINE := $(OLDARCH) ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) -override CC += -mlittle-endian -ifneq ($(cc-name),clang) -override CC += -mno-strict-align -endif -override AS += -mlittle-endian override LD += -EL -override CROSS32CC += -mlittle-endian override CROSS32AS += -mlittle-endian LDEMULATION := lppc GNUTARGET := powerpcle MULTIPLEWORD := -mno-multiple KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-save-toc-indirect) else -ifeq ($(call cc-option-yn,-mbig-endian),y) -override CC += -mbig-endian -override AS += -mbig-endian -endif override LD += -EB LDEMULATION := ppc GNUTARGET := powerpc MULTIPLEWORD := -mmultiple endif +cflags-$(CONFIG_CPU_BIG_ENDIAN) += $(call cc-option,-mbig-endian) +cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian +ifneq ($(cc-name),clang) + cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mno-strict-align +endif + +aflags-$(CONFIG_CPU_BIG_ENDIAN) += $(call cc-option,-mbig-endian) +aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian + ifeq ($(HAS_BIARCH),y) override AS += -a$(CONFIG_WORD_SIZE) override LD += -m elf$(CONFIG_WORD_SIZE)$(LDEMULATION) @@ -232,6 +231,9 @@ cpu-as-$(CONFIG_E200) += -Wa,-me200 KBUILD_AFLAGS += $(cpu-as-y) KBUILD_CFLAGS += $(cpu-as-y) +KBUILD_AFLAGS += $(aflags-y) +KBUILD_CFLAGS += $(cflags-y) + head-y := arch/powerpc/kernel/head_$(CONFIG_WORD_SIZE).o head-$(CONFIG_8xx) := arch/powerpc/kernel/head_8xx.o head-$(CONFIG_40x) := arch/powerpc/kernel/head_40x.o -- cgit v1.2.2 From b9a4a0d02c5b8d9a1397c11d741d2a1a56381178 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 9 Aug 2016 22:17:29 +1000 Subject: powerpc/vdso: Fix build rules to rebuild vdsos correctly When using if_changed, we need to add FORCE as a dependency (see Documentation/kbuild/makefiles.txt) otherwise we don't get command line change checking amongst other things. This has resulted in vdsos not being rebuilt when switching between big and little endian. The vdso64/32ld commands have to be changed around to avoid pulling FORCE into the linker command line (code copied from x86). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vdso32/Makefile | 6 +++--- arch/powerpc/kernel/vdso64/Makefile | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index cbabd143acae..78a7449bf489 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -30,7 +30,7 @@ CPPFLAGS_vdso32.lds += -P -C -Upowerpc $(obj)/vdso32_wrapper.o : $(obj)/vdso32.so # link rule for the .so file, .lds has to be first -$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE $(call if_changed,vdso32ld) # strip rule for the .so file @@ -39,12 +39,12 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) # assembly rules for the .S files -$(obj-vdso32): %.o: %.S +$(obj-vdso32): %.o: %.S FORCE $(call if_changed_dep,vdso32as) # actual build commands quiet_cmd_vdso32ld = VDSO32L $@ - cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ + cmd_vdso32ld = $(CROSS32CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) quiet_cmd_vdso32as = VDSO32A $@ cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $< diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index c710802b8fb6..366ae09b14c1 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -23,7 +23,7 @@ CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so # link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE $(call if_changed,vdso64ld) # strip rule for the .so file @@ -32,12 +32,12 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) # assembly rules for the .S files -$(obj-vdso64): %.o: %.S +$(obj-vdso64): %.o: %.S FORCE $(call if_changed_dep,vdso64as) # actual build commands quiet_cmd_vdso64ld = VDSO64L $@ - cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ + cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) quiet_cmd_vdso64as = VDSO64A $@ cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< -- cgit v1.2.2 From 5cf0791da5c162ebc14b01eb01631cfa7ed4fa6e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 5 Aug 2016 15:37:39 +0200 Subject: x86/mm: Disable preemption during CR3 read+write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a subtle preemption race on UP kernels: Usually current->mm (and therefore mm->pgd) stays the same during the lifetime of a task so it does not matter if a task gets preempted during the read and write of the CR3. But then, there is this scenario on x86-UP: TaskA is in do_exit() and exit_mm() sets current->mm = NULL followed by: -> mmput() -> exit_mmap() -> tlb_finish_mmu() -> tlb_flush_mmu() -> tlb_flush_mmu_tlbonly() -> tlb_flush() -> flush_tlb_mm_range() -> __flush_tlb_up() -> __flush_tlb() -> __native_flush_tlb() At this point current->mm is NULL but current->active_mm still points to the "old" mm. Let's preempt taskA _after_ native_read_cr3() by taskB. TaskB has its own mm so CR3 has changed. Now preempt back to taskA. TaskA has no ->mm set so it borrows taskB's mm and so CR3 remains unchanged. Once taskA gets active it continues where it was interrupted and that means it writes its old CR3 value back. Everything is fine because userland won't need its memory anymore. Now the fun part: Let's preempt taskA one more time and get back to taskB. This time switch_mm() won't do a thing because oldmm (->active_mm) is the same as mm (as per context_switch()). So we remain with a bad CR3 / PGD and return to userland. The next thing that happens is handle_mm_fault() with an address for the execution of its code in userland. handle_mm_fault() realizes that it has a PTE with proper rights so it returns doing nothing. But the CPU looks at the wrong PGD and insists that something is wrong and faults again. And again. And one more time… This pagefault circle continues until the scheduler gets tired of it and puts another task on the CPU. It gets little difficult if the task is a RT task with a high priority. The system will either freeze or it gets fixed by the software watchdog thread which usually runs at RT-max prio. But waiting for the watchdog will increase the latency of the RT task which is no good. Fix this by disabling preemption across the critical code section. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1470404259-26290-1-git-send-email-bigeasy@linutronix.de [ Prettified the changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4e5be94e079a..6fa85944af83 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) -- cgit v1.2.2 From 3e035305875cfa8a58c1ca573d0cfa6a7f201f27 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Aug 2016 19:14:29 +0200 Subject: x86/entry: Clarify the RF saving/restoring situation with SYSCALL/SYSRET Clarify why exactly RF cannot be restored properly by SYSRET to avoid confusion. No functionality change. Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160803171429.GA2590@nazgul.tnic Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9f85827db24e..d172c619c449 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -288,11 +288,15 @@ return_from_SYSCALL_64: jne opportunistic_sysret_failed /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot + * restore RF properly. If the slowpath sets it for whatever reason, we + * need to restore it correctly. + * + * SYSRET can restore TF, but unlike IRET, restoring TF results in a + * trap from userspace immediately after SYSRET. This would cause an + * infinite loop whenever #DB happens with register state that satisfies + * the opportunistic SYSRET conditions. For example, single-stepping + * this user code: * * movq $stuck_here, %rcx * pushfq -- cgit v1.2.2 From 054f621fd5b1c7245710f5d3935c94ce6ae4b3b7 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:50 -0500 Subject: x86/platform/UV: Fix problem with UV4 Socket IDs not being contiguous The UV4 Socket IDs are not guaranteed to equate to Node values which can cause the GAM (Global Addressable Memory) table lookups to fail. Fix this by using an independent index into the GAM table instead of the Socket ID to reference the base address. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.048755337@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 09b59adaea3f..d9187336aa2e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -325,7 +325,7 @@ static __init void build_uv_gr_table(void) struct uv_gam_range_entry *gre = uv_gre_table; struct uv_gam_range_s *grt; unsigned long last_limit = 0, ram_limit = 0; - int bytes, i, sid, lsid = -1; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; if (!gre) return; @@ -356,11 +356,12 @@ static __init void build_uv_gr_table(void) } sid = gre->sockid - _min_socket; if (lsid < sid) { /* new range */ - grt = &_gr_table[sid]; - grt->base = lsid; + grt = &_gr_table[indx]; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid = sid; + lindx = indx++; continue; } if (lsid == sid && !ram_limit) { /* update range */ @@ -371,7 +372,7 @@ static __init void build_uv_gr_table(void) } if (!ram_limit) { /* non-contiguous ram range */ grt++; - grt->base = sid - 1; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; -- cgit v1.2.2 From e363d24c2b997c421476c6aa00547edadf678efe Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:51 -0500 Subject: x86/platform/UV: Fix bug with iounmap() of the UV4 EFI System Table causing a crash Save the uv_systab::size field before doing the iounmap() of the struct pointer, to avoid a NULL dereference crash. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.250424783@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/bios_uv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 66b2166ea4a1..4e9fd1378aec 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -199,12 +199,14 @@ void uv_bios_init(void) return; } + /* Starting with UV4 the UV systab size is variable */ if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { + int size = uv_systab->size; + iounmap(uv_systab); - uv_systab = ioremap(efi.uv_systab, uv_systab->size); + uv_systab = ioremap(efi.uv_systab, size); if (!uv_systab) { - pr_err("UV: UVsystab: ioremap(%d) failed!\n", - uv_systab->size); + pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); return; } } -- cgit v1.2.2 From 22ac2bca92f2d92c6495248d65ff648182df428d Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:52 -0500 Subject: x86/platform/UV: Fix problem with UV4 BIOS providing incorrect PXM values There are some circumstances where the UV4 BIOS cannot provide the correct Proximity Node values to associate with specific Sockets and Physical Nodes. The decision was made to remove these values from BIOS and for the kernel to get these values from the standard ACPI tables. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.414210079@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/bios.h | 5 +++-- arch/x86/kernel/apic/x2apic_uv_x.c | 28 ++++++++++------------------ 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index c852590254d5..e652a7cc6186 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -79,7 +79,7 @@ struct uv_gam_range_entry { u16 nasid; /* HNasid */ u16 sockid; /* Socket ID, high bits of APIC ID */ u16 pnode; /* Index to MMR and GRU spaces */ - u32 pxm; /* ACPI proximity domain number */ + u32 unused2; u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ }; @@ -88,7 +88,8 @@ struct uv_gam_range_entry { #define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ #define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ #define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ -#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 +#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */ +#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3 #define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ #define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d9187336aa2e..6aa0545879bb 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -1156,19 +1156,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", - "SID", "PN", "PXM"); + "SID", "PN"); } pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, ((unsigned long)(gre->limit - lgre)) >> (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ - gre->type, gre->nasid, gre->sockid, - gre->pnode, gre->pxm); + gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; if (sock_min > gre->sockid) @@ -1287,7 +1286,7 @@ static void __init build_socket_tables(void) _pnode_to_socket[i] = SOCK_EMPTY; /* fill in pnode/node/addr conversion list values */ - pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; @@ -1295,20 +1294,18 @@ static void __init build_socket_tables(void) if (_socket_to_pnode[i] != SOCK_EMPTY) continue; /* duplicate */ _socket_to_pnode[i] = gre->pnode; - _socket_to_node[i] = gre->pxm; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], - _socket_to_node[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* check socket -> node values */ + /* Set socket -> node values */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1319,14 +1316,9 @@ static void __init build_socket_tables(void) lnid = nid; apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - i = sockid - minsock; - - if (nid != _socket_to_node[i]) { - pr_warn( - "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", - i, sockid, gre->type, _socket_to_node[i], nid); - _socket_to_node[i] = nid; - } + _socket_to_node[sockid - minsock] = nid; + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", + sockid, apicid, nid); } /* Setup physical blade to pnode translation from GAM Range Table */ -- cgit v1.2.2 From 5a52e8f822374bebc702bb2688ed8b5515bbb55b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:53 -0500 Subject: x86/platform/UV: Fix kernel panic running RHEL kdump kernel on UV systems The latest UV kernel support panics when RHEL7 kexec's the kdump kernel to make a dumpfile. This patch fixes the problem by turning off all UV support if NUMA is off. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.577755634@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 6aa0545879bb..cb0673c1e940 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + /* Setup early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; -- cgit v1.2.2 From 5e44258d168b2bdee51d9e2e1f1f4726ff9775cd Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Sun, 31 Jul 2016 23:24:50 -0400 Subject: x86/build: Reduce the W=1 warnings noise when compiling x86 syscall tables Building an X86_64 kernel with W=1 throws a total of 9,948 lines of warnings of this form for both 32-bit and 64-bit syscall tables. Given that the entire rest of the build for my config only generates 8,375 lines of output, this is a big reduction in the warnings generated. The warnings follow this pattern: ./arch/x86/include/generated/asm/syscalls_32.h:885:21: warning: initialized field overwritten [-Woverride-init] __SYSCALL_I386(379, compat_sys_pwritev2, ) ^ arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386' #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, ^~~ ./arch/x86/include/generated/asm/syscalls_32.h:885:21: note: (near initialization for 'ia32_sys_call_table[379]') __SYSCALL_I386(379, compat_sys_pwritev2, ) ^ arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386' #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, Since we intentionally build the syscall tables this way, ignore that one warning in the two files. Signed-off-by: Valdis Kletnieks Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7464.1470021890@turing-police.cc.vt.edu Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index fe91c25092da..77f28ce9c646 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -5,6 +5,8 @@ OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y +CFLAGS_syscall_64.o += -Wno-override-init +CFLAGS_syscall_32.o += -Wno-override-init obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o -- cgit v1.2.2 From b79daf85899215d5ede3641806db2e2a77b776b4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 27 Jul 2016 16:20:40 -0700 Subject: x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation The Memory Protection Keys "rights register" (PKRU) is XSAVE-managed, and is saved/restored along with the FPU state. When kernel code accesses FPU regsisters, it does a delicate dance with preempt. Otherwise, the context switching code can get confused as to whether the most up-to-date state is in the registers themselves or in the XSAVE buffer. But, PKRU is not a normal FPU register. Using it does not generate the normal device-not-available (#NM) exceptions which means we can not manage it lazily, and the kernel completley disallows using lazy mode when it is enabled. The dance with preempt *only* occurs when managing the FPU lazily. Since we never manage PKRU lazily, we do not have to do the dance with preempt; we can access it directly. Doing it this way saves a ton of complicated code (and is faster too). Further, the XSAVES reenabling failed to patch a bit of code in fpu__xfeature_set_state() the checked for compacted buffers. That check caused fpu__xfeature_set_state() to silently refuse to work when the kernel is using compacted XSAVE buffers. This broke execute-only and future pkey_mprotect() support when using compact XSAVE buffers. But, removing fpu__xfeature_set_state() gets rid of this issue, in addition to the nice cleanup and speedup. This fixes the same thing as a fix that Sai posted: https://lkml.org/lkml/2016/7/25/637 The fix that he posted is a much more obviously correct, but I think we should just do this instead. Reported-by: Sai Praneeth Prakhya Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi Shankar Cc: Thomas Gleixner Cc: Yu-Cheng Yu Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 138 ++++++------------------------------------- 1 file changed, 17 insertions(+), 121 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 680049aa4593..01567aa87503 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } - -/* - * Set xfeatures (aka XSTATE_BV) bit for a feature that we want - * to take out of its "init state". This will ensure that an - * XRSTOR actually restores the state. - */ -static void fpu__xfeature_set_non_init(struct xregs_state *xsave, - int xstate_feature_mask) -{ - xsave->header.xfeatures |= xstate_feature_mask; -} - -/* - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) - * @xsave_state_ptr: a pointer to a copy of the state that you would - * like written in to the current task's FPU xsave state. This pointer - * must not be located in the current tasks's xsave area. - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. - */ -static void fpu__xfeature_set_state(int xstate_feature_mask, - void *xstate_feature_src, size_t len) -{ - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; - struct fpu *fpu = ¤t->thread.fpu; - void *dst; - - if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - WARN_ONCE(1, "%s() attempted with no xsave support", __func__); - return; - } - - /* - * Tell the FPU code that we need the FPU state to be in - * 'fpu' (not in the registers), and that we need it to - * be stable while we write to it. - */ - fpu__current_fpstate_write_begin(); - - /* - * This method *WILL* *NOT* work for compact-format - * buffers. If the 'xstate_feature_mask' is unset in - * xcomp_bv then we may need to move other feature state - * "up" in the buffer. - */ - if (xsave->header.xcomp_bv & xstate_feature_mask) { - WARN_ON_ONCE(1); - goto out; - } - - /* find the location in the xsave buffer of the desired state */ - dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); - - /* - * Make sure that the pointer being passed in did not - * come from the xsave buffer itself. - */ - WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); - - /* put the caller-provided data in the location */ - memcpy(dst, xstate_feature_src, len); - - /* - * Mark the xfeature so that the CPU knows there is state - * in the buffer now. - */ - fpu__xfeature_set_non_init(xsave, xstate_feature_mask); -out: - /* - * We are done writing to the 'fpu'. Reenable preeption - * and (possibly) move the fpstate back in to the fpregs. - */ - fpu__current_fpstate_write_end(); -} - #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* - * This will go out and modify the XSAVE buffer so that PKRU is - * set to a particular state for access to 'pkey'. - * - * PKRU state does affect kernel access to user memory. We do - * not modfiy PKRU *itself* here, only the XSAVE state that will - * be restored in to PKRU when we return back to userspace. + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct pkru_state *old_pkru_state; - struct pkru_state new_pkru_state; + u32 old_pkru; int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); u32 new_pkru_bits = 0; @@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; + /* + * For most XSAVE components, this would be an arduous task: + * brining fpstate up to date with fpregs, updating fpstate, + * then re-populating fpregs. But, for components that are + * never lazily managed, we can just access the fpregs + * directly. PKRU is never managed lazily, so we can just + * manipulate it directly. Make sure it stays that way. + */ + WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) @@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, /* Shift the bits in to the correct place in PKRU for pkey: */ new_pkru_bits <<= pkey_shift; - /* Locate old copy of the state in the xsave buffer: */ - old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); - - /* - * When state is not in the buffer, it is in the init - * state, set it manually. Otherwise, copy out the old - * state. - */ - if (!old_pkru_state) - new_pkru_state.pkru = 0; - else - new_pkru_state.pkru = old_pkru_state->pkru; - - /* Mask off any old bits in place: */ - new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - - /* Set the newly-requested bits: */ - new_pkru_state.pkru |= new_pkru_bits; - - /* - * We could theoretically live without zeroing pkru.pad. - * The current XSAVE feature state definition says that - * only bytes 0->3 are used. But we do not want to - * chance leaking kernel stack out to userspace in case a - * memcpy() of the whole xsave buffer was done. - * - * They're in the same cacheline anyway. - */ - new_pkru_state.pad = 0; + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); return 0; } -- cgit v1.2.2 From 18b43e89d295cc65151c505c643c98fb2c320e59 Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Thu, 4 Aug 2016 17:56:53 -0700 Subject: ARC: Call trace_hardirqs_on() before enabling irqs trace_hardirqs_on_caller() in lockdep.c expects to be called before, not after interrupts are actually enabled. The following comment in kernel/locking/lockdep.c substantiates this claim: " /* * We're enabling irqs and according to our state above irqs weren't * already enabled, yet we find the hardware thinks they are in fact * enabled.. someone messed up their IRQ state tracing. */ " An example can be found in include/linux/irqflags.h: do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) Without this change, we hit the following DEBUG_LOCKS_WARN_ON. [ 7.760000] ------------[ cut here ]------------ [ 7.760000] WARNING: CPU: 0 PID: 1 at kernel/locking/lockdep.c:2711 resume_user_mode_begin+0x48/0xf0 [ 7.770000] DEBUG_LOCKS_WARN_ON(!irqs_disabled()) [ 7.780000] Modules linked in: [ 7.780000] CPU: 0 PID: 1 Comm: init Not tainted 4.7.0-00003-gc668bb9-dirty #366 [ 7.790000] [ 7.790000] Stack Trace: [ 7.790000] arc_unwind_core.constprop.1+0xa4/0x118 [ 7.800000] warn_slowpath_fmt+0x72/0x158 [ 7.800000] resume_user_mode_begin+0x48/0xf0 [ 7.810000] ---[ end trace 6f6a7a8fae20d2f0 ]--- Signed-off-by: Daniel Mentz Cc: Signed-off-by: Vineet Gupta --- arch/arc/include/asm/irqflags-compact.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arc/include/asm/irqflags-compact.h b/arch/arc/include/asm/irqflags-compact.h index c1d36458bfb7..4c6eed80cd8b 100644 --- a/arch/arc/include/asm/irqflags-compact.h +++ b/arch/arc/include/asm/irqflags-compact.h @@ -188,10 +188,10 @@ static inline int arch_irqs_disabled(void) .endm .macro IRQ_ENABLE scratch + TRACE_ASM_IRQ_ENABLE lr \scratch, [status32] or \scratch, \scratch, (STATUS_E1_MASK | STATUS_E2_MASK) flag \scratch - TRACE_ASM_IRQ_ENABLE .endm #endif /* __ASSEMBLY__ */ -- cgit v1.2.2 From 45c3b08a117e2232fc8d7b9e849ead36386f4f96 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 13 Jun 2016 16:38:27 +0200 Subject: ARC: Elide redundant setup of DMA callbacks For resources shared by all cores such as SLC and IOC, only the master core needs to do any setups / enabling / disabling etc. Cc: Signed-off-by: Vineet Gupta --- arch/arc/mm/cache.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 5a294b2c3cb3..0b10efe3a6a7 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -921,6 +921,15 @@ void arc_cache_init(void) printk(arc_cache_mumbojumbo(0, str, sizeof(str))); + /* + * Only master CPU needs to execute rest of function: + * - Assume SMP so all cores will have same cache config so + * any geomtry checks will be same for all + * - IOC setup / dma callbacks only need to be setup once + */ + if (cpu) + return; + if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE)) { struct cpuinfo_arc_cache *ic = &cpuinfo_arc700[cpu].icache; -- cgit v1.2.2 From 7de249964f5578e67b99699c5f0b405738d820a2 Mon Sep 17 00:00:00 2001 From: Dave Weinstein Date: Thu, 28 Jul 2016 11:55:41 -0700 Subject: arm: oabi compat: add missing access checks Add access checks to sys_oabi_epoll_wait() and sys_oabi_semtimedop(). This fixes CVE-2016-3857, a local privilege escalation under CONFIG_OABI_COMPAT. Cc: stable@vger.kernel.org Reported-by: Chiachih Wu Reviewed-by: Kees Cook Reviewed-by: Nicolas Pitre Signed-off-by: Dave Weinstein Signed-off-by: Linus Torvalds --- arch/arm/kernel/sys_oabi-compat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 087acb569b63..5f221acd21ae 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -279,8 +279,12 @@ asmlinkage long sys_oabi_epoll_wait(int epfd, mm_segment_t fs; long ret, err, i; - if (maxevents <= 0 || maxevents > (INT_MAX/sizeof(struct epoll_event))) + if (maxevents <= 0 || + maxevents > (INT_MAX/sizeof(*kbuf)) || + maxevents > (INT_MAX/sizeof(*events))) return -EINVAL; + if (!access_ok(VERIFY_WRITE, events, sizeof(*events) * maxevents)) + return -EFAULT; kbuf = kmalloc(sizeof(*kbuf) * maxevents, GFP_KERNEL); if (!kbuf) return -ENOMEM; @@ -317,6 +321,8 @@ asmlinkage long sys_oabi_semtimedop(int semid, if (nsops < 1 || nsops > SEMOPM) return -EINVAL; + if (!access_ok(VERIFY_READ, tsops, sizeof(*tsops) * nsops)) + return -EFAULT; sops = kmalloc(sizeof(*sops) * nsops, GFP_KERNEL); if (!sops) return -ENOMEM; -- cgit v1.2.2 From 549fba3a615a5dc6219bb41b3ee3951173fcf1e6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Jun 2016 16:21:19 +0200 Subject: ARM: don't include removed directories Three platforms used to have header files in include/mach that are now all gone, but the removed directories are still being included, which leads to -Wmissing-include-dirs warnings. This removes the extra -I flags. Signed-off-by: Arnd Bergmann --- arch/arm/mach-mvebu/Makefile | 3 +-- arch/arm/mach-realview/Makefile | 3 +-- arch/arm/mach-s5pv210/Makefile | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-mvebu/Makefile b/arch/arm/mach-mvebu/Makefile index e53c6cfcab51..6c6497e80a7b 100644 --- a/arch/arm/mach-mvebu/Makefile +++ b/arch/arm/mach-mvebu/Makefile @@ -1,5 +1,4 @@ -ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \ - -I$(srctree)/arch/arm/plat-orion/include +ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/arch/arm/plat-orion/include AFLAGS_coherency_ll.o := -Wa,-march=armv7-a CFLAGS_pmsu.o := -march=armv7-a diff --git a/arch/arm/mach-realview/Makefile b/arch/arm/mach-realview/Makefile index dae8d86ef4cc..404882130956 100644 --- a/arch/arm/mach-realview/Makefile +++ b/arch/arm/mach-realview/Makefile @@ -1,8 +1,7 @@ # # Makefile for the linux kernel. # -ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \ - -I$(srctree)/arch/arm/plat-versatile/include +ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/arch/arm/plat-versatile/include obj-y := core.o obj-$(CONFIG_REALVIEW_DT) += realview-dt.o diff --git a/arch/arm/mach-s5pv210/Makefile b/arch/arm/mach-s5pv210/Makefile index 72b9e9671507..fa7fb716e388 100644 --- a/arch/arm/mach-s5pv210/Makefile +++ b/arch/arm/mach-s5pv210/Makefile @@ -5,7 +5,7 @@ # # Licensed under GPLv2 -ccflags-$(CONFIG_ARCH_MULTIPLATFORM) += -I$(srctree)/$(src)/include -I$(srctree)/arch/arm/plat-samsung/include +ccflags-$(CONFIG_ARCH_MULTIPLATFORM) += -I$(srctree)/arch/arm/plat-samsung/include # Core -- cgit v1.2.2 From de8a06f674a6f9d781e5087e9217319c7b831821 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 9 Jun 2016 09:50:28 +0200 Subject: ARM: hide mach-*/ include for ARM_SINGLE_ARMV7M The machine specific header files are exported for traditional platforms, but not for the ones that use ARCH_MULTIPLATFORM, as they could conflict with one another. In case of ARM_SINGLE_ARMV7M, we end up also exporting them, but that appears to be a mistake, and we should treat it the same way as ARCH_MULTIPLATFORM here. 'make W=1' warns about this because it passes -Wmissing-includes to gcc and the directories are not actually present. Signed-off-by: Arnd Bergmann --- arch/arm/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 56ea5c60b318..61f6ccc19cfa 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -260,12 +260,14 @@ machdirs := $(patsubst %,arch/arm/mach-%/,$(machine-y)) platdirs := $(patsubst %,arch/arm/plat-%/,$(sort $(plat-y))) ifneq ($(CONFIG_ARCH_MULTIPLATFORM),y) +ifneq ($(CONFIG_ARM_SINGLE_ARMV7M),y) ifeq ($(KBUILD_SRC),) KBUILD_CPPFLAGS += $(patsubst %,-I%include,$(machdirs) $(platdirs)) else KBUILD_CPPFLAGS += $(patsubst %,-I$(srctree)/%include,$(machdirs) $(platdirs)) endif endif +endif export TEXT_OFFSET GZFLAGS MMUEXT -- cgit v1.2.2 From af9d238c6a1d5c131104bb011cc4c55a6207b9a2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Jul 2016 14:16:24 +0200 Subject: ARM: oxnas: select reset controller framework For unknown reasons, we have to enable three symbols for a platform to use a reset controller driver, otherwise we get a Kconfig warning: warning: (MACH_OX810SE) selects RESET_OXNAS which has unmet direct dependencies (RESET_CONTROLLER) This selects the other two symbols for oxnas. Signed-off-by: Arnd Bergmann Acked-by: Neil Armstrong --- arch/arm/mach-oxnas/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/mach-oxnas/Kconfig b/arch/arm/mach-oxnas/Kconfig index 567496bd250a..29100beb2e7f 100644 --- a/arch/arm/mach-oxnas/Kconfig +++ b/arch/arm/mach-oxnas/Kconfig @@ -11,11 +11,13 @@ if ARCH_OXNAS config MACH_OX810SE bool "Support OX810SE Based Products" + select ARCH_HAS_RESET_CONTROLLER select COMMON_CLK_OXNAS select CPU_ARM926T select MFD_SYSCON select OXNAS_RPS_TIMER select PINCTRL_OXNAS + select RESET_CONTROLLER select RESET_OXNAS select VERSATILE_FPGA_IRQ help -- cgit v1.2.2 From 83e484fcbe44e91fe8efb41b33ed3434806eeaac Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 10 Aug 2016 11:38:24 +0200 Subject: ARM: dts: add syscon compatible string for CP syscon This syscon needs to be looked up by flash protection, CLCD display output settings and other consumers. Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/integratorcp.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/integratorcp.dts b/arch/arm/boot/dts/integratorcp.dts index d43f15b4f79a..79430fbfec3b 100644 --- a/arch/arm/boot/dts/integratorcp.dts +++ b/arch/arm/boot/dts/integratorcp.dts @@ -94,7 +94,7 @@ }; syscon { - compatible = "arm,integrator-cp-syscon"; + compatible = "arm,integrator-cp-syscon", "syscon"; reg = <0xcb000000 0x100>; }; -- cgit v1.2.2 From f2b54191f7bef2925db39eceb84e7e19ff6d8f56 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 10 Aug 2016 11:38:12 +0200 Subject: ARM: dts: add syscon compatible string for AP syscon This syscon needs to be looked up by clocks, flash protection and other consumers. Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/integratorap.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/integratorap.dts b/arch/arm/boot/dts/integratorap.dts index cf06e32ee108..4b34b54e09a1 100644 --- a/arch/arm/boot/dts/integratorap.dts +++ b/arch/arm/boot/dts/integratorap.dts @@ -42,7 +42,7 @@ }; syscon { - compatible = "arm,integrator-ap-syscon"; + compatible = "arm,integrator-ap-syscon", "syscon"; reg = <0x11000000 0x100>; interrupt-parent = <&pic>; /* These are the logical module IRQs */ -- cgit v1.2.2 From b5c86b7496d74f6e454bcab5166efa023e1f0459 Mon Sep 17 00:00:00 2001 From: Ralf Ramsauer Date: Mon, 18 Jul 2016 11:46:48 +0200 Subject: ARM: tegra: fix erroneous address in dts c90bb7b enabled the high speed UARTs of the Jetson TK1. Due to a merge quirk, wrong addresses were introduced. Fix it and use the correct addresses. Thierry let me know, that there is another patch (b5896f67ab3c in linux-next) in preparation which removes all the '0,' prefixes of unit addresses on Tegra124 and is planned to go upstream in 4.8, so this patch will get reverted then. But for the moment, this patch is necessary to fix current misbehaviour. Fixes: c90bb7b9b9 ("ARM: tegra: Add high speed UARTs to Jetson TK1 device tree") Signed-off-by: Ralf Ramsauer Acked-by: Thierry Reding Cc: stable@vger.kernel.org # v4.7 Cc: linux-tegra@vger.kernel.org Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/tegra124-jetson-tk1.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/tegra124-jetson-tk1.dts b/arch/arm/boot/dts/tegra124-jetson-tk1.dts index e52b82449a79..6403e0de540e 100644 --- a/arch/arm/boot/dts/tegra124-jetson-tk1.dts +++ b/arch/arm/boot/dts/tegra124-jetson-tk1.dts @@ -1382,7 +1382,7 @@ * Pin 41: BR_UART1_TXD * Pin 44: BR_UART1_RXD */ - serial@70006000 { + serial@0,70006000 { compatible = "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart"; status = "okay"; }; @@ -1394,7 +1394,7 @@ * Pin 71: UART2_CTS_L * Pin 74: UART2_RTS_L */ - serial@70006040 { + serial@0,70006040 { compatible = "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart"; status = "okay"; }; -- cgit v1.2.2 From a20303725ec31ea0fcf498f1885b1d4245a4ee56 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 10 Aug 2016 14:02:17 +0200 Subject: ARM: dts: realview: Fix PBX-A9 cache description Clearly QEMU is very permissive in how its PL310 model may be set up, but the real hardware turns out to be far more particular about things actually being correct. Fix up the DT description so that the real thing actually boots: - The arm,data-latency and arm,tag-latency properties need 3 cells to be valid, otherwise we end up retaining the default 8-cycle latencies which leads pretty quickly to lockup. - The arm,dirty-latency property is only relevant to L210/L220, so get rid of it. - The cache geometry override also leads to lockup and/or general misbehaviour. Irritatingly, the manual doesn't state the actual PL310 configuration, but based on the boardfile code and poking registers from the Boot Monitor, it would seem to be 8 sets of 16KB ways. With that, we can successfully boot to enjoy the fun of mismatched FPUs... Cc: stable@vger.kernel.org Signed-off-by: Robin Murphy Tested-by: Mark Rutland Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/arm-realview-pbx-a9.dts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/arm-realview-pbx-a9.dts b/arch/arm/boot/dts/arm-realview-pbx-a9.dts index db808f92dd79..90d00b407f85 100644 --- a/arch/arm/boot/dts/arm-realview-pbx-a9.dts +++ b/arch/arm/boot/dts/arm-realview-pbx-a9.dts @@ -70,13 +70,12 @@ * associativity as these may be erroneously set * up by boot loader(s). */ - cache-size = <1048576>; // 1MB - cache-sets = <4096>; + cache-size = <131072>; // 128KB + cache-sets = <512>; cache-line-size = <32>; arm,parity-disable; - arm,tag-latency = <1>; - arm,data-latency = <1 1>; - arm,dirty-latency = <1>; + arm,tag-latency = <1 1 1>; + arm,data-latency = <1 1 1>; }; scu: scu@1f000000 { -- cgit v1.2.2 From 5a3f75a4b2dab54b6b0d645b500583abfc27b260 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 3 Aug 2016 15:29:33 +0100 Subject: arm64: Kconfig: select ALPINE_MSI only if PCI is selected Even when PCI is disabled, ARCH_ALPINE selects ALPINE_MSI triggerring the following config warning: warning: (ARCH_ALPINE) selects ALPINE_MSI which has unmet direct dependencies (PCI) This patch makes selection of ALPINE_MSI conditional on PCI. Cc: Arnd Bergmann Acked-by: Antoine Tenart Signed-off-by: Sudeep Holla Signed-off-by: Arnd Bergmann --- arch/arm64/Kconfig.platforms | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index bb2616b16157..fd3ee3a392be 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -8,7 +8,7 @@ config ARCH_SUNXI config ARCH_ALPINE bool "Annapurna Labs Alpine platform" - select ALPINE_MSI + select ALPINE_MSI if PCI help This enables support for the Annapurna Labs Alpine Soc family. -- cgit v1.2.2 From f9db43bc296833451cbe4b03eb5e1f633ad1f787 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 3 Aug 2016 15:29:34 +0100 Subject: arm64: Kconfig: select HISILICON_IRQ_MBIGEN only if PCI is selected Even when PCI is disabled, ARCH_HISI selects HISILICON_IRQ_MBIGEN triggerring the following config warning: warning: (ARM64 && HISILICON_IRQ_MBIGEN) selects ARM_GIC_V3_ITS which has unmet direct dependencies (PCI && PCI_MSI) This patch makes selection of HISILICON_IRQ_MBIGEN conditional on PCI. Cc: Ma Jun Cc: Arnd Bergmann Signed-off-by: Sudeep Holla Signed-off-by: Arnd Bergmann --- arch/arm64/Kconfig.platforms | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index fd3ee3a392be..be5d824ebdba 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -66,7 +66,7 @@ config ARCH_LG1K config ARCH_HISI bool "Hisilicon SoC Family" select ARM_TIMER_SP804 - select HISILICON_IRQ_MBIGEN + select HISILICON_IRQ_MBIGEN if PCI help This enables support for Hisilicon ARMv8 SoC family -- cgit v1.2.2 From a545de5ce2ef3abc4db0b9331840acf59c8f9efa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 10 Aug 2016 16:27:41 -0700 Subject: revert "ARM: keystone: dts: add psci command definition" Revert commit 51d5d12b8f3d ("ARM: keystone: dts: add psci command definition"), which was inadvertently added twice. Cc: Russell King - ARM Linux Cc: Vitaly Andrianov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/boot/dts/keystone.dtsi | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/keystone.dtsi b/arch/arm/boot/dts/keystone.dtsi index 00cb314d5e4d..e23f46d15c80 100644 --- a/arch/arm/boot/dts/keystone.dtsi +++ b/arch/arm/boot/dts/keystone.dtsi @@ -70,14 +70,6 @@ cpu_on = <0x84000003>; }; - psci { - compatible = "arm,psci"; - method = "smc"; - cpu_suspend = <0x84000001>; - cpu_off = <0x84000002>; - cpu_on = <0x84000003>; - }; - soc { #address-cells = <1>; #size-cells = <1>; -- cgit v1.2.2 From 12beb346710b766b8e74a7a3ac8165835bd68def Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Aug 2016 22:47:59 +0200 Subject: Merge tag 'pxa-fixes-v4.8' of https://github.com/rjarzmik/linux into randconfig-4.8 This is the pxa changes for v4.8 cycle. This is a tiny fix couple to enable changes in includes in gpio API without breaking pxa boards. * tag 'pxa-fixes-v4.8' of https://github.com/rjarzmik/linux: ARM: pxa: add module.h for corgi symbol_get/symbol_put usage ARM: pxa: add module.h for spitz symbol_get/symbol_put usage --- arch/arm/mach-pxa/corgi.c | 1 + arch/arm/mach-pxa/spitz.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index dc109dc3a622..10bfdb169366 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -13,6 +13,7 @@ */ #include +#include /* symbol_get ; symbol_put */ #include #include #include diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 1080580b1343..2c150bfc0cd5 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -13,6 +13,7 @@ */ #include +#include /* symbol_get ; symbol_put */ #include #include #include -- cgit v1.2.2 From 62d16b5a3fca4d186e13215e0d7d2f6d36191796 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sat, 6 Aug 2016 12:20:39 +0200 Subject: x86/mm/kaslr: Fix -Wformat-security warning debug_putstr() is used to output strings without using printf-like formatting but debug_putstr(v) is defined as early_printk(v) in arch/x86/lib/kaslr.c. This makes clang reports the following warning when building with -Wformat-security: arch/x86/lib/kaslr.c:57:15: warning: format string is not a string literal (potentially insecure) [-Wformat-security] debug_putstr(purpose); ^~~~~~~ Fix this by using "%s" in early_printk(). Signed-off-by: Nicolas Iooss Acked-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160806102039.27221-1-nicolas.iooss_linux@m4x.org Signed-off-by: Ingo Molnar --- arch/x86/lib/kaslr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index f7dfeda83e5c..121f59c6ee54 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -19,7 +19,7 @@ #include #include -#define debug_putstr(v) early_printk(v) +#define debug_putstr(v) early_printk("%s", v) #define has_cpuflag(f) boot_cpu_has(f) #define get_boot_seed() kaslr_offset() #endif -- cgit v1.2.2 From ace7fab7a6cdd363a615ec537f2aa94dbc761ee2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Aug 2016 10:23:25 -0700 Subject: x86/mm: Fix swap entry comment and macro A recent patch changed the format of a swap PTE. The comment explaining the format of the swap PTE is wrong about the bits used for the swap type field. Amusingly, the ASCII art and the patch description are correct, but the comment itself is wrong. As I was looking at this, I also noticed that the SWP_OFFSET_FIRST_BIT has an off-by-one error. This does not really hurt anything. It just wasted a bit of space in the PTE, giving us 2^59 bytes of addressable space in our swapfiles instead of 2^60. But, it doesn't match with the comments, and it wastes a bit of space, so fix it. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Fixes: 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work around erratum") Link: http://lkml.kernel.org/r/20160810172325.E56AD7DA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 7e8ec7ae10fa..1cc82ece9ac1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names - * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 /* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -- cgit v1.2.2 From 82ba4faca1bffad429f15c90c980ffd010366c25 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Aug 2016 15:44:30 +0800 Subject: x86/irq: Do not substract irq_tlb_count from irq_call_count Since commit: 52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR") the TLB remote shootdown is done through call function vector. That commit didn't take care of irq_tlb_count, which a later commit: fd0f5869724f ("x86: Distinguish TLB shootdown interrupts from other functions call interrupts") ... tried to fix. The fix assumes every increase of irq_tlb_count has a corresponding increase of irq_call_count. So the irq_call_count is always bigger than irq_tlb_count and we could substract irq_tlb_count from irq_call_count. Unfortunately this is not true for the smp_call_function_single() case. The IPI is only sent if the target CPU's call_single_queue is empty when adding a csd into it in generic_exec_single. That means if two threads are both adding flush tlb csds to the same CPU's call_single_queue, only one IPI is sent. In other words, the irq_call_count is incremented by 1 but irq_tlb_count is incremented by 2. Over time, irq_tlb_count will be bigger than irq_call_count and the substract will produce a very large irq_call_count value due to overflow. Considering that: 1) it's not worth to send more IPIs for the sake of accurate counting of irq_call_count in generic_exec_single(); 2) it's not easy to tell if the call function interrupt is for TLB shootdown in __smp_call_function_single_interrupt(). Not to exclude TLB shootdown from call function count seems to be the simplest fix and this patch just does that. This bug was found by LKP's cyclic performance regression tracking recently with the vm-scalability test suite. I have bisected to commit: 3dec0ba0be6a ("mm/rmap: share the i_mmap_rwsem") This commit didn't do anything wrong but revealed the irq_call_count problem. IIUC, the commit makes rwc->remap_one in rmap_walk_file concurrent with multiple threads. When remap_one is try_to_unmap_one(), then multiple threads could queue flush TLB to the same CPU but only one IPI will be sent. Since the commit was added in Linux v3.19, the counting problem only shows up from v3.19 onwards. Signed-off-by: Aaron Lu Cc: Alex Shi Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Ying Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tomoki Sekiyama Link: http://lkml.kernel.org/r/20160811074430.GA18163@aaronlu.sh.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq.h | 4 ---- arch/x86/kernel/irq.c | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 7178043b0e1d..59405a248fc2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,10 +22,6 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - /* - * irq_tlb_count is double-counted in irq_call_count, so it must be - * subtracted from irq_call_count when displaying irq_call_count - */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 61521dc19c10..9f669fdd2010 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - - irq_stats(j)->irq_tlb_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) -- cgit v1.2.2 From 007b756053386af079ba963a8f5817ac651c7c59 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:13 -0700 Subject: x86/boot: Run reserve_bios_regions() after we initialize the memory map reserve_bios_regions() is a quirk that reserves memory that we might otherwise think is available. There's no need to run it so early, and running it before we have the memory map initialized with its non-quirky inputs makes it hard to make reserve_bios_regions() more intelligent. Move it right after we populate the memblock state. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/59f58618911005c799c6c9979ce6ae4881d907c2.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head32.c | 2 -- arch/x86/kernel/head64.c | 1 - arch/x86/kernel/setup.c | 2 ++ 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2dda0bc4576e..f16c55bfc090 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void) /* Initialize 32bit specific setup functions */ x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - - reserve_bios_regions(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 99d48e7d2974..54a2372f5dbb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_bios_regions(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95cf31c9f4ec..bf780e0f76ef 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1101,6 +1101,8 @@ void __init setup_arch(char **cmdline_p) efi_find_mirror(); } + reserve_bios_regions(); + /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. -- cgit v1.2.2 From 18bc7bd523e0fc5be8d76bf84bde733a97a8c375 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:14 -0700 Subject: x86/boot: Synchronize trampoline_cr4_features and mmu_cr4_features directly The initialization process for trampoline_cr4_features and mmu_cr4_features was confusing. The intent is for mmu_cr4_features and *trampoline_cr4_features to stay in sync, but trampoline_cr4_features is NULL until setup_real_mode() runs. The old code synchronized *trampoline_cr4_features *twice*, once in setup_real_mode() and once in setup_arch(). It also initialized mmu_cr4_features in setup_real_mode(), which causes the actual value of mmu_cr4_features to potentially depend on when setup_real_mode() is called. With this patch, mmu_cr4_features is initialized directly in setup_arch(), and *trampoline_cr4_features is synchronized to mmu_cr4_features when the trampoline is set up. After this patch, it should be safe to defer setup_real_mode(). Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d48a263f9912389b957dd495a7127b009259ffe0.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 17 ++++++++++------- arch/x86/realmode/init.c | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bf780e0f76ef..95c8c9c7dd6d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1131,6 +1131,16 @@ void __init setup_arch(char **cmdline_p) early_trap_pf_init(); + /* + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. + */ + if (boot_cpu_data.cpuid_level >= 0) + /* A CPU has %cr4 if and only if it has CPUID. */ + mmu_cr4_features = __read_cr4(); + setup_real_mode(); memblock_set_current_limit(get_max_mapped()); @@ -1180,13 +1190,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); - if (boot_cpu_data.cpuid_level >= 0) { - /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = __read_cr4(); - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - } - #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 705e3fffb4a1..c5bdc4e473e7 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -4,6 +4,7 @@ #include #include #include +#include struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -84,7 +85,7 @@ void __init setup_real_mode(void) trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = __read_cr4(); + *trampoline_cr4_features = mmu_cr4_features; trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; -- cgit v1.2.2 From d0de0f685db7faf2ae4597a39a59996dd84e18c7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:15 -0700 Subject: x86/boot: Defer setup_real_mode() to early_initcall time There's no need to run setup_real_mode() as early as we run it. Defer it to the same early_initcall that sets up the page permissions for the real mode code. This should be a code size reduction. More importantly, it give us a longer window in which we can allocate the real mode trampoline. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fd62f0da4f79357695e9bf3e365623736b05f119.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/realmode.h | 1 - arch/x86/kernel/setup.c | 2 -- arch/x86/realmode/init.c | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 9c6b890d5e7a..8d6777724ba4 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -59,6 +59,5 @@ extern unsigned char secondary_startup_64[]; #endif void reserve_real_mode(void); -void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95c8c9c7dd6d..0fa60f5f5a16 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1141,8 +1141,6 @@ void __init setup_arch(char **cmdline_p) /* A CPU has %cr4 if and only if it has CPUID. */ mmu_cr4_features = __read_cr4(); - setup_real_mode(); - memblock_set_current_limit(get_max_mapped()); /* diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index c5bdc4e473e7..747b71e8f547 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -30,7 +30,7 @@ void __init reserve_real_mode(void) base, (unsigned long long)mem, size); } -void __init setup_real_mode(void) +static void __init setup_real_mode(void) { u16 real_mode_seg; const u32 *rel; @@ -101,7 +101,7 @@ void __init setup_real_mode(void) * need to mark it executable at do_pre_smp_initcalls() at least, * thus run it as a early_initcall(). */ -static int __init set_real_mode_permissions(void) +static void __init set_real_mode_permissions(void) { unsigned char *base = (unsigned char *) real_mode_header; size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); @@ -120,7 +120,16 @@ static int __init set_real_mode_permissions(void) set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); +} + +static int __init init_real_mode(void) +{ + if (!real_mode_header) + panic("Real mode trampoline was not allocated"); + + setup_real_mode(); + set_real_mode_permissions(); return 0; } -early_initcall(set_real_mode_permissions); +early_initcall(init_real_mode); -- cgit v1.2.2 From 5ff3e2c3c3eebe13967d81ad1f23b9468fefea81 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:16 -0700 Subject: x86/boot: Rework reserve_real_mode() to allow multiple tries If reserve_real_mode() fails, panicing immediately means we're doomed. Make it safe to try more than once to allocate the trampoline: - Degrade a failure from panic() to pr_info(). (If we make it to setup_real_mode() without reserving the trampoline, we'll panic them.) - Factor out helpers so that platform code can supply a specific address to try. - Warn if reserve_real_mode() is called after we're done with the memblock allocator. If that were to happen, we would behave unpredictably. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/876e383038f3e9971aa72fd20a4f5da05f9d193d.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/realmode.h | 9 +++++++++ arch/x86/realmode/init.c | 29 +++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 8d6777724ba4..b2988c0ed829 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,6 +58,15 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif +static inline size_t real_mode_size_needed(void) +{ + if (real_mode_header) + return 0; /* already allocated. */ + + return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE); +} + +void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 747b71e8f547..5db706f14111 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -12,22 +13,34 @@ u32 *trampoline_cr4_features; /* Hold the pgd entry used on booting additional CPUs */ pgd_t trampoline_pgd_entry; +void __init set_real_mode_mem(phys_addr_t mem, size_t size) +{ + void *base = __va(mem); + + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); +} + void __init reserve_real_mode(void) { phys_addr_t mem; - unsigned char *base; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + size_t size = real_mode_size_needed(); + + if (!size) + return; + + WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); + if (!mem) { + pr_info("No sub-1M memory is available for the trampoline\n"); + return; + } - base = __va(mem); memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); + set_real_mode_mem(mem, size); } static void __init setup_real_mode(void) -- cgit v1.2.2 From 5bc653b7318217c54244a14f248f1f07abe0a865 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:17 -0700 Subject: x86/efi: Allocate a trampoline if needed in efi_free_boot_services() On my Dell XPS 13 9350 with firmware 1.4.4 and SGX on, if I boot Fedora 24's grub2-efi off a hard disk, my first 1MB of RAM looks like: efi: mem00: [Runtime Data |RUN| | | | | | | |WB|WT|WC|UC] range=[0x0000000000000000-0x0000000000000fff] (0MB) efi: mem01: [Boot Data | | | | | | | | |WB|WT|WC|UC] range=[0x0000000000001000-0x0000000000027fff] (0MB) efi: mem02: [Loader Data | | | | | | | | |WB|WT|WC|UC] range=[0x0000000000028000-0x0000000000029fff] (0MB) efi: mem03: [Reserved | | | | | | | | |WB|WT|WC|UC] range=[0x000000000002a000-0x000000000002bfff] (0MB) efi: mem04: [Runtime Data |RUN| | | | | | | |WB|WT|WC|UC] range=[0x000000000002c000-0x000000000002cfff] (0MB) efi: mem05: [Loader Data | | | | | | | | |WB|WT|WC|UC] range=[0x000000000002d000-0x000000000002dfff] (0MB) efi: mem06: [Conventional Memory| | | | | | | | |WB|WT|WC|UC] range=[0x000000000002e000-0x0000000000057fff] (0MB) efi: mem07: [Reserved | | | | | | | | |WB|WT|WC|UC] range=[0x0000000000058000-0x0000000000058fff] (0MB) efi: mem08: [Conventional Memory| | | | | | | | |WB|WT|WC|UC] range=[0x0000000000059000-0x000000000009ffff] (0MB) My EBDA is at 0x2c000, which blocks off everything from 0x2c000 and up, and my trampoline is 0x6000 bytes (6 pages), so it doesn't fit in the loader data range at 0x28000. Without this patch, it panics due to a failure to allocate the trampoline. With this patch, it works: [ +0.001744] Base memory trampoline at [ffff880000001000] 1000 size 24576 Signed-off-by: Andy Lutomirski Reviewed-by: Matt Fleming Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/998c77b3bf709f3dfed85cb30701ed1a5d8a438b.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/quirks.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch') diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 4480c06cade7..89d1146f5a6f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -254,6 +254,7 @@ void __init efi_free_boot_services(void) for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + size_t rm_size; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) @@ -263,6 +264,26 @@ void __init efi_free_boot_services(void) if (md->attribute & EFI_MEMORY_RUNTIME) continue; + /* + * Nasty quirk: if all sub-1MB memory is used for boot + * services, we can get here without having allocated the + * real mode trampoline. It's too late to hand boot services + * memory back to the memblock allocator, so instead + * try to manually allocate the trampoline if needed. + * + * I've seen this on a Dell XPS 13 9350 with firmware + * 1.4.4 with SGX enabled booting Linux via Fedora 24's + * grub2-efi on a hard disk. (And no, I don't know why + * this happened, but Linux should still try to boot rather + * panicing early.) + */ + rm_size = real_mode_size_needed(); + if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { + set_real_mode_mem(start, rm_size); + start += rm_size; + size -= rm_size; + } + free_bootmem_late(start, size); } -- cgit v1.2.2 From f72075c9eda8a43aeea2f9dbb8d187afd4a76f0b Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Thu, 11 Aug 2016 11:41:59 +0100 Subject: x86/platform/uv: Skip UV runtime services mapping in the efi_runtime_disabled case This problem has actually been in the UV code for a while, but we didn't catch it until recently, because we had been relying on EFI_OLD_MEMMAP to allow our systems to boot for a period of time. We noticed the issue when trying to kexec a recent community kernel, where we hit this NULL pointer dereference in efi_sync_low_kernel_mappings(): [ 0.337515] BUG: unable to handle kernel NULL pointer dereference at 0000000000000880 [ 0.346276] IP: [] efi_sync_low_kernel_mappings+0x5d/0x1b0 The problem doesn't show up with EFI_OLD_MEMMAP because we skip the chunk of setup_efi_state() that sets the efi_loader_signature for the kexec'd kernel. When the kexec'd kernel boots, it won't set EFI_BOOT in setup_arch, so we completely avoid the bug. We always kexec with noefi on the command line, so this shouldn't be an issue, but since we're not actually checking for efi_runtime_disabled in uv_bios_init(), we end up trying to do EFI runtime callbacks when we shouldn't be. This patch just adds a check for efi_runtime_disabled in uv_bios_init() so that we don't map in uv_systab when runtime_disabled == true. Signed-off-by: Alex Thorlton Signed-off-by: Matt Fleming Cc: # v4.7 Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1470912120-22831-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/bios_uv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 66b2166ea4a1..0df8a0370d32 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -187,7 +187,8 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); void uv_bios_init(void) { uv_systab = NULL; - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) { + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + !efi.uv_systab || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); return; } -- cgit v1.2.2 From d52c0569bab4edc888832df44dc7ac28517134f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 11 Aug 2016 16:08:35 +0200 Subject: x86/apic/x2apic, smp/hotplug: Don't use before alloc in x2apic_cluster_probe() I made a mistake while converting the driver to the hotplug state machine and as a result x2apic_cluster_probe() was accessing cpus_in_cluster before allocating it. This patch fixes it by setting the cpumask after the allocation the memory succeeded. While at it, I marked two functions static which are only used within this file. Reported-by: Laura Abbott Signed-off-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6b2c28471de5 ("x86/x2apic: Convert to CPU hotplug state machine") Link: http://lkml.kernel.org/r/1470924515-9444-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6368fa69d2af..54f35d988025 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -155,7 +155,7 @@ static void init_x2apic_ldr(void) /* * At CPU state changes, update the x2apic cluster sibling info. */ -int x2apic_prepare_cpu(unsigned int cpu) +static int x2apic_prepare_cpu(unsigned int cpu) { if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) return -ENOMEM; @@ -168,7 +168,7 @@ int x2apic_prepare_cpu(unsigned int cpu) return 0; } -int x2apic_dead_cpu(unsigned int this_cpu) +static int x2apic_dead_cpu(unsigned int this_cpu) { int cpu; @@ -186,13 +186,18 @@ int x2apic_dead_cpu(unsigned int this_cpu) static int x2apic_cluster_probe(void) { int cpu = smp_processor_id(); + int ret; if (!x2apic_mode) return 0; + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + x2apic_prepare_cpu, x2apic_dead_cpu); + if (ret < 0) { + pr_err("Failed to register X2APIC_PREPARE\n"); + return 0; + } cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); - cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", - x2apic_prepare_cpu, x2apic_dead_cpu); return 1; } -- cgit v1.2.2 From ad05711cec12131e1277ce749a99d08ecf233aa7 Mon Sep 17 00:00:00 2001 From: "David A. Long" Date: Wed, 10 Aug 2016 16:44:51 -0400 Subject: arm64: Remove stack duplicating code from jprobes Because the arm64 calling standard allows stacked function arguments to be anywhere in the stack frame, do not attempt to duplicate the stack frame for jprobes handler functions. Documentation changes to describe this issue have been broken out into a separate patch in order to simultaneously address them in other architecture(s). Signed-off-by: David A. Long Acked-by: Masami Hiramatsu Acked-by: Marc Zyngier Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kprobes.h | 2 -- arch/arm64/kernel/probes/kprobes.c | 31 +++++-------------------------- 2 files changed, 5 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index 61b49150dfa3..1737aecfcc5e 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -22,7 +22,6 @@ #define __ARCH_WANT_KPROBES_INSN_SLOT #define MAX_INSN_SIZE 1 -#define MAX_STACK_SIZE 128 #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 @@ -47,7 +46,6 @@ struct kprobe_ctlblk { struct prev_kprobe prev_kprobe; struct kprobe_step_ctx ss_ctx; struct pt_regs jprobe_saved_regs; - char jprobes_stack[MAX_STACK_SIZE]; }; void arch_remove_kprobe(struct kprobe *); diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index bf9768588288..c6b0f40620d8 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -41,18 +41,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); -static inline unsigned long min_stack_size(unsigned long addr) -{ - unsigned long size; - - if (on_irq_stack(addr, raw_smp_processor_id())) - size = IRQ_STACK_PTR(raw_smp_processor_id()) - addr; - else - size = (unsigned long)current_thread_info() + THREAD_START_SP - addr; - - return min(size, FIELD_SIZEOF(struct kprobe_ctlblk, jprobes_stack)); -} - static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { /* prepare insn slot */ @@ -489,20 +477,15 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - long stack_ptr = kernel_stack_pointer(regs); kcb->jprobe_saved_regs = *regs; /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. + * Since we can't be sure where in the stack frame "stacked" + * pass-by-value arguments are stored we just don't try to + * duplicate any of the stack. Do not use jprobes on functions that + * use more than 64 bytes (after padding each to an 8 byte boundary) + * of arguments, or pass individual arguments larger than 16 bytes. */ - kasan_disable_current(); - memcpy(kcb->jprobes_stack, (void *)stack_ptr, - min_stack_size(stack_ptr)); - kasan_enable_current(); instruction_pointer_set(regs, (unsigned long) jp->entry); preempt_disable(); @@ -554,10 +537,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } unpause_graph_tracing(); *regs = kcb->jprobe_saved_regs; - kasan_disable_current(); - memcpy((void *)stack_addr, kcb->jprobes_stack, - min_stack_size(stack_addr)); - kasan_enable_current(); preempt_enable_no_resched(); return 1; } -- cgit v1.2.2 From 68187872c76a96ed4db7bfb064272591f02e208b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 11 Aug 2016 17:45:21 +0200 Subject: uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions Since instruction decoder now supports EVEX-encoded instructions, two fixes are needed to correctly handle them in uprobes. Extended bits for MODRM.rm field need to be sanitized just like we do it for VEX3, to avoid encoding wrong register for register-relative access. EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be ignored by the CPU (since GPRs go only up to 15, not 31), but let's be paranoid here: proper encoding for register-relative access should have EVEX.x = 1. Secondly, we should fetch vex.vvvv for EVEX too. This is now super easy because instruction decoder populates vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for VEX2. Signed-off-by: Denys Vlasenko Acked-by: Masami Hiramatsu Acked-by: Srikar Dronamraju Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Jim Keniston Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Cc: # v4.1+ Fixes: 8a764a875fe3 ("x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX") Link: http://lkml.kernel.org/r/20160811154521.20469-1-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31d99ff..495c776de4b4 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* - * Similar treatment for VEX3 prefix. - * TODO: add XOP/EVEX treatment when insn decoder supports them + * Similar treatment for VEX3/EVEX prefix. + * TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa - * (evex will need setting of both b and x since - * in non-sib encoding evex.x is 4th bit of MODRM.rm) - * Setting VEX3.b (setting because it has inverted meaning): + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* - * TODO: add XOP, EXEV vvvv reading. + * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. -- cgit v1.2.2 From 10e9e7bd598f9a66a11a22514c68c13c41fc821b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 11 Aug 2016 07:30:20 -0700 Subject: perf/x86/intel/uncore: Fix uncore num_counters Some uncore boxes' num_counters value for Haswell server and Broadwell server are not correct (too large, off by one). This issue was found by comparing the code with the document. Although there is no bug report from users yet, accessing non-existent counters is dangerous and the behavior is undefined: it may cause miscounting or even crashes. This patch makes them consistent with the uncore document. Reported-by: Lukasz Odzioba Signed-off-by: Kan Liang Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Link: http://lkml.kernel.org/r/1470925820-59847-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 824e54086e07..8aee83bcf71f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2626,7 +2626,7 @@ void hswep_uncore_cpu_init(void) static struct intel_uncore_type hswep_uncore_ha = { .name = "ha", - .num_counters = 5, + .num_counters = 4, .num_boxes = 2, .perf_ctr_bits = 48, SNBEP_UNCORE_PCI_COMMON_INIT(), @@ -2645,7 +2645,7 @@ static struct uncore_event_desc hswep_uncore_imc_events[] = { static struct intel_uncore_type hswep_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, @@ -2691,7 +2691,7 @@ static struct intel_uncore_type hswep_uncore_irp = { static struct intel_uncore_type hswep_uncore_qpi = { .name = "qpi", - .num_counters = 5, + .num_counters = 4, .num_boxes = 3, .perf_ctr_bits = 48, .perf_ctr = SNBEP_PCI_PMON_CTR0, @@ -2773,7 +2773,7 @@ static struct event_constraint hswep_uncore_r3qpi_constraints[] = { static struct intel_uncore_type hswep_uncore_r3qpi = { .name = "r3qpi", - .num_counters = 4, + .num_counters = 3, .num_boxes = 3, .perf_ctr_bits = 44, .constraints = hswep_uncore_r3qpi_constraints, @@ -2972,7 +2972,7 @@ static struct intel_uncore_type bdx_uncore_ha = { static struct intel_uncore_type bdx_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, -- cgit v1.2.2 From 95f3be798472f63b495ca4712af005ea5ac7aa47 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 11 Aug 2016 07:31:14 -0700 Subject: perf/x86/intel/uncore: Add enable_box for client MSR uncore There are bug reports about miscounting uncore counters on some client machines like Sandybridge, Broadwell and Skylake. It is very likely to be observed on idle systems. This issue is caused by a hardware issue. PERF_GLOBAL_CTL could be cleared after Package C7, and nothing will be count. The related errata (HSD 158) could be found in: www.intel.com/content/dam/www/public/us/en/documents/specification-updates/4th-gen-core-family-desktop-specification-update.pdf This patch tries to work around this issue by re-enabling PERF_GLOBAL_CTL in ->enable_box(). The workaround does not cover all cases. It helps for new events after returning from C7. But it cannot prevent C7, it will still miscount if a counter is already active. There is no drawback in leaving it enabled, so it does not need disable_box() here. Signed-off-by: Kan Liang Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470925874-59943-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snb.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 97a69dbba649..9d35ec0cb8fc 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -100,6 +100,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); +} + static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -127,6 +133,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .enable_box = snb_uncore_msr_enable_box, .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, @@ -192,6 +199,12 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); +} + static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -200,6 +213,7 @@ static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) static struct intel_uncore_ops skl_uncore_msr_ops = { .init_box = skl_uncore_msr_init_box, + .enable_box = skl_uncore_msr_enable_box, .exit_box = skl_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, -- cgit v1.2.2 From 75a4615c95a2de4fa592c1312fb8446e74b4e5eb Mon Sep 17 00:00:00 2001 From: Julius Niedworok Date: Wed, 3 Aug 2016 16:39:54 +0200 Subject: KVM: s390: set the prefix initially properly When KVM_RUN is triggered on a VCPU without an initial reset, a validity intercept occurs. Setting the prefix will set the KVM_REQ_MMU_RELOAD bit initially, thus preventing the bug. Reviewed-by: David Hildenbrand Acked-by: Cornelia Huck Signed-off-by: Julius Niedworok Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3f3ae4865d57..e63f6ed0a936 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1672,6 +1672,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) KVM_SYNC_CRS | KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT; + kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; /* fprs can be synchronized via vrs, even if the guest has no vx. With -- cgit v1.2.2 From aca411a4b17a4aebe14ecdf253373db5b7ee6058 Mon Sep 17 00:00:00 2001 From: Julius Niedworok Date: Wed, 3 Aug 2016 16:39:55 +0200 Subject: KVM: s390: reset KVM_REQ_MMU_RELOAD if mapping the prefix failed When triggering KVM_RUN without a user memory region being mapped (KVM_SET_USER_MEMORY_REGION) a validity intercept occurs. This could happen, if the user memory region was not mapped initially or if it was unmapped after the vcpu is initialized. The function kvm_s390_handle_requests checks for the KVM_REQ_MMU_RELOAD bit. The check function always clears this bit. If gmap_mprotect_notify returns an error code, the mapping failed, but the KVM_REQ_MMU_RELOAD was not set anymore. So the next time kvm_s390_handle_requests is called, the execution would fall trough the check for KVM_REQ_MMU_RELOAD. The bit needs to be resetted, if gmap_mprotect_notify returns an error code. Resetting the bit with kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu) fixes the bug. Reviewed-by: David Hildenbrand Signed-off-by: Julius Niedworok Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e63f6ed0a936..f142215ed30d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2362,8 +2362,10 @@ retry: rc = gmap_mprotect_notify(vcpu->arch.gmap, kvm_s390_get_prefix(vcpu), PAGE_SIZE * 2, PROT_WRITE); - if (rc) + if (rc) { + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); return rc; + } goto retry; } -- cgit v1.2.2 From 023e9fddc3616b005c3753fc1bb6526388cd7a30 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 9 Aug 2016 19:13:00 +0200 Subject: KVM: PPC: Move xics_debugfs_init out of create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we are about to hold the kvm->lock during the create operation on KVM devices, we should move the call to xics_debugfs_init into its own function, since holding a mutex over extended amounts of time might not be a good idea. Introduce an init operation on the kvm_device_ops struct which cannot fail and call this, if configured, after the device has been created. Signed-off-by: Christoffer Dall Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/powerpc/kvm/book3s_xics.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index a75ba38a2d81..f2def8e45fef 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1341,8 +1341,6 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) return ret; } - xics_debugfs_init(xics); - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (cpu_has_feature(CPU_FTR_ARCH_206)) { /* Enable real mode support */ @@ -1354,9 +1352,17 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) return 0; } +static void kvmppc_xics_init(struct kvm_device *dev) +{ + struct kvmppc_xics *xics = (struct kvmppc_xics *)dev->private; + + xics_debugfs_init(xics); +} + struct kvm_device_ops kvm_xics_ops = { .name = "kvm-xics", .create = kvmppc_xics_create, + .init = kvmppc_xics_init, .destroy = kvmppc_xics_free, .set_attr = xics_set_attr, .get_attr = xics_get_attr, -- cgit v1.2.2 From a28ebea2adc4a2bef5989a5a181ec238f59fbcad Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 9 Aug 2016 19:13:01 +0200 Subject: KVM: Protect device ops->create and list_add with kvm->lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM devices were manipulating list data structures without any form of synchronization, and some implementations of the create operations also suffered from a lack of synchronization. Now when we've split the xics create operation into create and init, we can hold the kvm->lock mutex while calling the create operation and when manipulating the devices list. The error path in the generic code gets slightly ugly because we have to take the mutex again and delete the device from the list, but holding the mutex during anon_inode_getfd or releasing/locking the mutex in the common non-error path seemed wrong. Signed-off-by: Christoffer Dall Reviewed-by: Paolo Bonzini Acked-by: Christian Borntraeger Signed-off-by: Radim Krčmář --- arch/arm/kvm/arm.c | 6 +++++- arch/powerpc/kvm/book3s_xics.c | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index d94bb9093ead..75f130ef6504 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1009,9 +1009,13 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_IRQCHIP: { + int ret; if (!vgic_present) return -ENXIO; - return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + mutex_lock(&kvm->lock); + ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + mutex_unlock(&kvm->lock); + return ret; } case KVM_ARM_SET_DEVICE_ADDR: { struct kvm_arm_device_addr dev_addr; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index f2def8e45fef..05aa11399a78 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1329,12 +1329,10 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) xics->kvm = kvm; /* Already there ? */ - mutex_lock(&kvm->lock); if (kvm->arch.xics) ret = -EEXIST; else kvm->arch.xics = xics; - mutex_unlock(&kvm->lock); if (ret) { kfree(xics); -- cgit v1.2.2 From c604cffa93478f8888bec62b23d6073dad03d43a Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 11 Aug 2016 11:58:12 +0100 Subject: MIPS: KVM: Fix mapped fault broken commpage handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_mips_handle_mapped_seg_tlb_fault() appears to map the guest page at virtual address 0 to PFN 0 if the guest has created its own mapping there. The intention is unclear, but it may have been an attempt to protect the zero page from being mapped to anything but the comm page in code paths you wouldn't expect from genuine commpage accesses (guest kernel mode cache instructions on that address, hitting trapping instructions when executing from that address with a coincidental TLB eviction during the KVM handling, and guest user mode accesses to that address). Fix this to check for mappings exactly at KVM_GUEST_COMMPAGE_ADDR (it may not be at address 0 since commit 42aa12e74e91 ("MIPS: KVM: Move commpage so 0x0 is unmapped")), and set the corresponding EntryLo to be interpreted as 0 (invalid). Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: # 3.10.x- Signed-off-by: Radim Krčmář --- arch/mips/kvm/mmu.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 57319ee57c4f..96e0b24cfe5c 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -138,35 +138,42 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; kvm_pfn_t pfn0, pfn1; + long tlb_lo[2]; int ret; - if ((tlb->tlb_hi & VPN2_MASK) == 0) { - pfn0 = 0; - pfn1 = 0; - } else { - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) - >> PAGE_SHIFT) < 0) - return -1; - - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) - >> PAGE_SHIFT) < 0) - return -1; - - pfn0 = kvm->arch.guest_pmap[ - mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT]; - pfn1 = kvm->arch.guest_pmap[ - mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT]; - } + tlb_lo[0] = tlb->tlb_lo[0]; + tlb_lo[1] = tlb->tlb_lo[1]; + + /* + * The commpage address must not be mapped to anything else if the guest + * TLB contains entries nearby, or commpage accesses will break. + */ + if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) & + VPN2_MASK & (PAGE_MASK << 1))) + tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0; + + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[0]) + >> PAGE_SHIFT) < 0) + return -1; + + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[1]) + >> PAGE_SHIFT) < 0) + return -1; + + pfn0 = kvm->arch.guest_pmap[ + mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT]; + pfn1 = kvm->arch.guest_pmap[ + mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT]; /* Get attributes from the Guest TLB */ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb->tlb_lo[0] & ENTRYLO_D) | - (tlb->tlb_lo[0] & ENTRYLO_V); + (tlb_lo[0] & ENTRYLO_D) | + (tlb_lo[0] & ENTRYLO_V); entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb->tlb_lo[1] & ENTRYLO_D) | - (tlb->tlb_lo[1] & ENTRYLO_V); + (tlb_lo[1] & ENTRYLO_D) | + (tlb_lo[1] & ENTRYLO_V); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo[0], tlb->tlb_lo[1]); -- cgit v1.2.2 From 8985d50382359e5bf118fdbefc859d0dbf6cebc7 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 11 Aug 2016 11:58:13 +0100 Subject: MIPS: KVM: Add missing gfn range check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_mips_handle_mapped_seg_tlb_fault() calculates the guest frame number based on the guest TLB EntryLo values, however it is not range checked to ensure it lies within the guest_pmap. If the physical memory the guest refers to is out of range then dump the guest TLB and emit an internal error. Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: # 3.10.x- Signed-off-by: Radim Krčmář --- arch/mips/kvm/mmu.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 96e0b24cfe5c..701c1b64cef8 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -138,6 +138,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; kvm_pfn_t pfn0, pfn1; + gfn_t gfn0, gfn1; long tlb_lo[2]; int ret; @@ -152,18 +153,24 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, VPN2_MASK & (PAGE_MASK << 1))) tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0; - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[0]) - >> PAGE_SHIFT) < 0) + gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT; + gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT; + if (gfn0 >= kvm->arch.guest_pmap_npages || + gfn1 >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n", + __func__, gfn0, gfn1, tlb->tlb_hi); + kvm_mips_dump_guest_tlbs(vcpu); + return -1; + } + + if (kvm_mips_map_page(kvm, gfn0) < 0) return -1; - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[1]) - >> PAGE_SHIFT) < 0) + if (kvm_mips_map_page(kvm, gfn1) < 0) return -1; - pfn0 = kvm->arch.guest_pmap[ - mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT]; - pfn1 = kvm->arch.guest_pmap[ - mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT]; + pfn0 = kvm->arch.guest_pmap[gfn0]; + pfn1 = kvm->arch.guest_pmap[gfn1]; /* Get attributes from the Guest TLB */ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | -- cgit v1.2.2 From 0741f52d1b980dbeb290afe67d88fc2928edd8ab Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 11 Aug 2016 11:58:14 +0100 Subject: MIPS: KVM: Fix gfn range check in kseg0 tlb faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two consecutive gfns are loaded into host TLB, so ensure the range check isn't off by one if guest_pmap_npages is odd. Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: # 3.10.x- Signed-off-by: Radim Krčmář --- arch/mips/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 701c1b64cef8..6a8a21859502 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -99,7 +99,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, } gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); - if (gfn >= kvm->arch.guest_pmap_npages) { + if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, gfn, badvaddr); kvm_mips_dump_host_tlbs(); -- cgit v1.2.2 From 9b731bcfdec4c159ad2e4312e25d69221709b96a Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 11 Aug 2016 11:58:15 +0100 Subject: MIPS: KVM: Propagate kseg0/mapped tlb fault errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Propagate errors from kvm_mips_handle_kseg0_tlb_fault() and kvm_mips_handle_mapped_seg_tlb_fault(), usually triggering an internal error since they normally indicate the guest accessed bad physical memory or the commpage in an unexpected way. Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Fixes: e685c689f3a8 ("KVM/MIPS32: Privileged instruction/target branch emulation.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: # 3.10.x- Signed-off-by: Radim Krčmář --- arch/mips/kvm/emulate.c | 35 ++++++++++++++++++++++++++--------- arch/mips/kvm/mmu.c | 12 +++++++++--- 2 files changed, 35 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 6eb52b9c9818..e788515f766b 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1642,8 +1642,14 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, preempt_disable(); if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - if (kvm_mips_host_tlb_lookup(vcpu, va) < 0) - kvm_mips_handle_kseg0_tlb_fault(va, vcpu); + if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 && + kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) { + kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n", + __func__, va, vcpu, read_c0_entryhi()); + er = EMULATE_FAIL; + preempt_enable(); + goto done; + } } else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) || KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { int index; @@ -1680,12 +1686,18 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, run, vcpu); preempt_enable(); goto dont_update_pc; - } else { - /* - * We fault an entry from the guest tlb to the - * shadow host TLB - */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb); + } + /* + * We fault an entry from the guest tlb to the + * shadow host TLB + */ + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { + kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", + __func__, va, index, vcpu, + read_c0_entryhi()); + er = EMULATE_FAIL; + preempt_enable(); + goto done; } } } else { @@ -2659,7 +2671,12 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb); + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { + kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", + __func__, va, index, vcpu, + read_c0_entryhi()); + er = EMULATE_FAIL; + } } } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6a8a21859502..6cfdcf55572d 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -368,9 +368,15 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) local_irq_restore(flags); return KVM_INVALID_INST; } - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch. - guest_tlb[index]); + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, + &vcpu->arch.guest_tlb[index])) { + kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n", + __func__, opc, index, vcpu, + read_c0_entryhi()); + kvm_mips_dump_guest_tlbs(vcpu); + local_irq_restore(flags); + return KVM_INVALID_INST; + } inst = *(opc); } local_irq_restore(flags); -- cgit v1.2.2 From 9adeb8e72dbfe976709df01e259ed556ee60e779 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 9 Aug 2016 18:25:26 -0700 Subject: arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 7 +++++++ arch/arm64/mm/fault.c | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 96e4a2b64cc1..441420ca7d08 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -353,6 +353,8 @@ el1_sync: lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1 b.eq el1_da + cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1 + b.eq el1_ia cmp x24, #ESR_ELx_EC_SYS64 // configurable trap b.eq el1_undef cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception @@ -364,6 +366,11 @@ el1_sync: cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1 b.ge el1_dbg b el1_inv + +el1_ia: + /* + * Fall through to the Data abort case + */ el1_da: /* * Data abort handling diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c8beaa0da7df..05d2bd776c69 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -153,6 +153,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, } #endif +static bool is_el1_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; +} + /* * The kernel tried to access some page that wasn't present. */ @@ -161,8 +166,9 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, { /* * Are we prepared to handle this kernel fault? + * We are almost certainly not prepared to handle instruction faults. */ - if (fixup_exception(regs)) + if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) return; /* @@ -267,7 +273,8 @@ static inline bool is_permission_fault(unsigned int esr) unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || + (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } static bool is_el0_instruction_abort(unsigned int esr) @@ -312,6 +319,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); + if (is_el1_instruction_abort(esr)) + die("Attempting to execute userspace memory", regs, esr); + if (!search_exception_tables(regs->pc)) die("Accessing user space memory outside uaccess.h routines", regs, esr); } -- cgit v1.2.2 From 0194e760f7d2f42adb5e1db31b27a4331dd89c2f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 11 Aug 2016 14:11:05 +0100 Subject: arm64: hibernate: avoid potential TLB conflict In create_safe_exec_page we install a set of global mappings in TTBR0, then subsequently invalidate TLBs. While TTBR0 points at the zero page, and the TLBs should be free of stale global entries, we may have stale ASID-tagged entries (e.g. from the EFI runtime services mappings) for the same VAs. Per the ARM ARM these ASID-tagged entries may conflict with newly-allocated global entries, and we must follow a Break-Before-Make approach to avoid issues resulting from this. This patch reworks create_safe_exec_page to invalidate TLBs while the zero page is still in place, ensuring that there are no potential conflicts when the new TTBR0 value is installed. As a single CPU is online while this code executes, we do not need to perform broadcast TLB maintenance, and can call local_flush_tlb_all(), which also subsumes some barriers. The remaining assembly is converted to use write_sysreg() and isb(). Other than this, we safely manipulate TTBRs in the hibernate dance. The code we install as part of the new TTBR0 mapping (the hibernated kernel's swsusp_arch_suspend_exit) installs a zero page into TTBR1, invalidates TLBs, then installs its preferred value. Upon being restored to the middle of swsusp_arch_suspend, the new image will call __cpu_suspend_exit, which will call cpu_uninstall_idmap, installing the zero page in TTBR0 and invalidating all TLB entries. Fixes: 82869ac57b5d ("arm64: kernel: Add support for hibernate/suspend-to-disk") Signed-off-by: Mark Rutland Acked-by: James Morse Tested-by: James Morse Cc: Lorenzo Pieralisi Cc: Will Deacon Cc: # 4.7+ Signed-off-by: Catalin Marinas --- arch/arm64/kernel/hibernate.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 21ab5df9fa76..b2e7de8726e8 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -35,6 +35,7 @@ #include #include #include +#include #include /* @@ -217,12 +218,22 @@ static int create_safe_exec_page(void *src_start, size_t length, set_pte(pte, __pte(virt_to_phys((void *)dst) | pgprot_val(PAGE_KERNEL_EXEC))); - /* Load our new page tables */ - asm volatile("msr ttbr0_el1, %0;" - "isb;" - "tlbi vmalle1is;" - "dsb ish;" - "isb" : : "r"(virt_to_phys(pgd))); + /* + * Load our new page tables. A strict BBM approach requires that we + * ensure that TLBs are free of any entries that may overlap with the + * global mappings we are about to install. + * + * For a real hibernate/resume cycle TTBR0 currently points to a zero + * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI + * runtime services), while for a userspace-driven test_resume cycle it + * points to userspace page tables (and we must point it at a zero page + * ourselves). Elsewhere we only (un)install the idmap with preemption + * disabled, so T0SZ should be as required regardless. + */ + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + write_sysreg(virt_to_phys(pgd), ttbr0_el1); + isb(); *phys_dst_addr = virt_to_phys((void *)dst); -- cgit v1.2.2 From dfbca61af0b654990b9af8297ac574a9986d8275 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 11 Aug 2016 14:11:06 +0100 Subject: arm64: hibernate: handle allocation failures In create_safe_exec_page(), we create a copy of the hibernate exit text, along with some page tables to map this via TTBR0. We then install the new tables in TTBR0. In swsusp_arch_resume() we call create_safe_exec_page() before trying a number of operations which may fail (e.g. copying the linear map page tables). If these fail, we bail out of swsusp_arch_resume() and return an error code, but leave TTBR0 as-is. Subsequently, the core hibernate code will call free_basic_memory_bitmaps(), which will free all of the memory allocations we made, including the page tables installed in TTBR0. Thus, we may have TTBR0 pointing at dangling freed memory for some period of time. If the hibernate attempt was triggered by a user requesting a hibernate test via the reboot syscall, we may return to userspace with the clobbered TTBR0 value. Avoid these issues by reorganising swsusp_arch_resume() such that we have no failure paths after create_safe_exec_page(). We also add a check that the zero page allocation succeeded, matching what we have for other allocations. Fixes: 82869ac57b5d ("arm64: kernel: Add support for hibernate/suspend-to-disk") Signed-off-by: Mark Rutland Acked-by: James Morse Cc: Lorenzo Pieralisi Cc: Will Deacon Cc: # 4.7+ Signed-off-by: Catalin Marinas --- arch/arm64/kernel/hibernate.c | 59 +++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index b2e7de8726e8..65d81f965e74 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -404,6 +404,38 @@ int swsusp_arch_resume(void) void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); + /* + * Restoring the memory image will overwrite the ttbr1 page tables. + * Create a second copy of just the linear map, and use this when + * restoring. + */ + tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!tmp_pg_dir) { + pr_err("Failed to allocate memory for temporary page tables."); + rc = -ENOMEM; + goto out; + } + rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0); + if (rc) + goto out; + + /* + * Since we only copied the linear map, we need to find restore_pblist's + * linear map address. + */ + lm_restore_pblist = LMADDR(restore_pblist); + + /* + * We need a zero page that is zero before & after resume in order to + * to break before make on the ttbr1 page tables. + */ + zero_page = (void *)get_safe_page(GFP_ATOMIC); + if (!zero_page) { + pr_err("Failed to allocate zero page."); + rc = -ENOMEM; + goto out; + } + /* * Locate the exit code in the bottom-but-one page, so that *NULL * still has disastrous affects. @@ -429,27 +461,6 @@ int swsusp_arch_resume(void) */ __flush_dcache_area(hibernate_exit, exit_size); - /* - * Restoring the memory image will overwrite the ttbr1 page tables. - * Create a second copy of just the linear map, and use this when - * restoring. - */ - tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!tmp_pg_dir) { - pr_err("Failed to allocate memory for temporary page tables."); - rc = -ENOMEM; - goto out; - } - rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0); - if (rc) - goto out; - - /* - * Since we only copied the linear map, we need to find restore_pblist's - * linear map address. - */ - lm_restore_pblist = LMADDR(restore_pblist); - /* * KASLR will cause the el2 vectors to be in a different location in * the resumed kernel. Load hibernate's temporary copy into el2. @@ -464,12 +475,6 @@ int swsusp_arch_resume(void) __hyp_set_vectors(el2_vectors); } - /* - * We need a zero page that is zero before & after resume in order to - * to break before make on the ttbr1 page tables. - */ - zero_page = (void *)get_safe_page(GFP_ATOMIC); - hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1, resume_hdr.reenter_kernel, lm_restore_pblist, resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page)); -- cgit v1.2.2 From 2323439fd0c844b0e84d16968699c419809d480e Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Wed, 15 Jun 2016 11:27:59 +0300 Subject: arm64: defconfig: add options for virtualization and containers Enable options commonly needed by popular virtualization and container applications. Use modules when possible to avoid too much overhead for users not interested. - add namespace and cgroup options needed - add seccomp - optional, but enhances Qemu etc - bridge, nat, veth, macvtap and multicast for routing guests and containers - btfrs and overlayfs modules for container COW backends - while near it, make fuse a module instead of built-in. Generated with make saveconfig and dropping unrelated spurious change hunks while commiting. bloat-o-meter old-vmlinux vmlinux: add/remove: 905/390 grow/shrink: 767/229 up/down: 183513/-94861 (88652) .... Total: Before=10515408, After=10604060, chg +0.84% Signed-off-by: Riku Voipio Signed-off-by: Catalin Marinas --- arch/arm64/configs/defconfig | 52 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 0555b7caaf2c..2016444601f8 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -15,10 +15,14 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_HUGETLB=y -# CONFIG_UTS_NS is not set -# CONFIG_IPC_NS is not set -# CONFIG_NET_NS is not set +CONFIG_CPUSETS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_USER_NS=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y @@ -71,6 +75,7 @@ CONFIG_PREEMPT=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CMA=y +CONFIG_SECCOMP=y CONFIG_XEN=y CONFIG_KEXEC=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set @@ -84,10 +89,37 @@ CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y +CONFIG_IP_MULTICAST=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_IP_PNP_BOOTP=y -# CONFIG_IPV6 is not set +CONFIG_IPV6=m +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_MANGLE=m +CONFIG_NF_CONNTRACK_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y CONFIG_BPF_JIT=y CONFIG_CFG80211=m CONFIG_MAC80211=m @@ -103,6 +135,7 @@ CONFIG_MTD=y CONFIG_MTD_M25P80=y CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_NBD=m CONFIG_VIRTIO_BLK=y CONFIG_SRAM=y # CONFIG_SCSI_PROC_FS is not set @@ -120,7 +153,10 @@ CONFIG_SATA_SIL24=y CONFIG_PATA_PLATFORM=y CONFIG_PATA_OF_PLATFORM=y CONFIG_NETDEVICES=y +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m CONFIG_TUN=y +CONFIG_VETH=m CONFIG_VIRTIO_NET=y CONFIG_AMD_XGBE=y CONFIG_NET_XGENE=y @@ -350,12 +386,16 @@ CONFIG_EXYNOS_ADC=y CONFIG_PWM_SAMSUNG=y CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_FANOTIFY=y CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA=y CONFIG_AUTOFS4_FS=y -CONFIG_FUSE_FS=y -CONFIG_CUSE=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m CONFIG_VFAT_FS=y CONFIG_TMPFS=y CONFIG_HUGETLBFS=y -- cgit v1.2.2 From 53fb45d3df6fb64eff6c314b3fa2e279a2496e5b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 8 Jun 2016 21:26:23 +0900 Subject: arm64: defconfig: enable CONFIG_LOCALVERSION_AUTO When CONFIG_LOCALVERSION_AUTO is disabled, the version string is just a tag name (or with a '+' appended if HEAD is not a tagged commit). During the development (and especially when git-bisecting), longer version string would be helpful to identify the commit we are running. This is a default y option, so drop the unset to enable it. Signed-off-by: Masahiro Yamada Signed-off-by: Catalin Marinas --- arch/arm64/configs/defconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 2016444601f8..eadf4855ad2d 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1,4 +1,3 @@ -# CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y -- cgit v1.2.2 From 783011b13095430262333fd64e17666954064664 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 21 Mar 2016 04:20:53 -0700 Subject: unicore32: mm: Add missing parameter to arch_vma_access_permitted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unicore32 fails to compile with the following errors. mm/memory.c: In function ‘__handle_mm_fault’: mm/memory.c:3381: error: too many arguments to function ‘arch_vma_access_permitted’ mm/gup.c: In function ‘check_vma_flags’: mm/gup.c:456: error: too many arguments to function ‘arch_vma_access_permitted’ mm/gup.c: In function ‘vma_permits_fault’: mm/gup.c:640: error: too many arguments to function ‘arch_vma_access_permitted’ Fixes: d61172b4b695b ("mm/core, x86/mm/pkeys: Differentiate instruction fetches") Cc: Dave Hansen Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Guenter Roeck Acked-by: Guan Xuetao --- arch/unicore32/include/asm/mmu_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index e35632ef23c7..62dfc644c908 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -98,7 +98,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, } static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool foreign) + bool write, bool execute, bool foreign) { /* by default, allow everything */ return true; -- cgit v1.2.2 From 2b05980d895e51fc46d3d27c8d1bcc696dddca99 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 8 Jun 2016 20:11:58 -0700 Subject: h8300: Add missing include file to asm/io.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit h8300 builds fail with arch/h8300/include/asm/io.h:9:15: error: unknown type name ‘u8’ arch/h8300/include/asm/io.h:15:15: error: unknown type name ‘u16’ arch/h8300/include/asm/io.h:21:15: error: unknown type name ‘u32’ and many related errors. Fixes: 23c82d41bdf4 ("kexec-allow-architectures-to-override-boot-mapping-fix") Cc: Andrew Morton Signed-off-by: Guenter Roeck --- arch/h8300/include/asm/io.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/h8300/include/asm/io.h b/arch/h8300/include/asm/io.h index 2e221c5f0203..f86918aed9e1 100644 --- a/arch/h8300/include/asm/io.h +++ b/arch/h8300/include/asm/io.h @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include + /* H8/300 internal I/O functions */ #define __raw_readb __raw_readb -- cgit v1.2.2 From 5d87f493ddb1b86a0569fa3c4037fa9efc0c7183 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 14 Aug 2016 04:07:32 +0200 Subject: x86/power/64: Use __pa() for physical address computation The value of temp_level4_pgt is the physical address of the top-level page directory, so use __pa() to compute it. Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- arch/x86/power/hibernate_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a3e3ccc87138..9634557a5444 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -113,7 +113,7 @@ static int set_up_temporary_mappings(void) return result; } - temp_level4_pgt = (unsigned long)pgd - __PAGE_OFFSET; + temp_level4_pgt = __pa(pgd); return 0; } -- cgit v1.2.2 From 172b1d6b5a9337eb8c1ec294b80e448e03a9ac17 Mon Sep 17 00:00:00 2001 From: Xiaodong Liu Date: Fri, 12 Aug 2016 06:24:42 -0400 Subject: crypto: sha256-mb - fix ctx pointer and digest copy 1. fix ctx pointer Use req_ctx which is the ctx for the next job that have been completed in the lanes instead of the first completed job rctx, whose completion could have been called and released. 2. fix digest copy Use XMM register to copy another 16 bytes sha256 digest instead of a regular register. Signed-off-by: Xiaodong Liu Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-mb/sha256_mb.c | 4 ++-- arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 89fa85e8b10c..6f97fb33ae21 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -485,10 +485,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); else { local_bh_disable(); - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); local_bh_enable(); } } diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index b691da981cd9..a78a0694ddef 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -265,13 +265,14 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 - movl _args_digest+4*32(state, idx, 4), tmp2_w + vmovd _args_digest(state , idx, 4) , %xmm0 vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1 - vmovdqu %xmm0, _result_digest(job_rax) - movl tmp2_w, _result_digest+1*16(job_rax) + vmovdqu %xmm0, _result_digest(job_rax) + offset = (_result_digest + 1*16) + vmovdqu %xmm1, offset(job_rax) pop %rbx -- cgit v1.2.2 From e67479b13ede47cc2f5beb5b51e67fdb30778ee8 Mon Sep 17 00:00:00 2001 From: Xiaodong Liu Date: Fri, 12 Aug 2016 06:28:31 -0400 Subject: crypto: sha512-mb - fix ctx pointer 1. fix ctx pointer Use req_ctx which is the ctx for the next job that have been completed in the lanes instead of the first completed job rctx, whose completion could have been called and released. Signed-off-by: Xiaodong Liu Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-mb/sha512_mb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index f4cf5b78fd36..d210174a52b0 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -497,10 +497,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); else { local_bh_disable(); - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); local_bh_enable(); } } -- cgit v1.2.2 From 9ac715954682b23d293d910ad2697554171035e7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 17 Aug 2016 10:46:10 +0200 Subject: KVM: arm/arm64: Change misleading use of is_error_pfn When converting a gfn to a pfn, we call gfn_to_pfn_prot, which returns various kinds of error values. It turns out that is_error_pfn() only returns true when the gfn was found in a memory slot and could somehow not be used, but it does not return true if the gfn does not belong to any memory slot. Change use to is_error_noslot_pfn() which covers both cases. Note: Since we already check for kvm_is_error_hva(hva) explicitly in the caller of this function while holding the kvm->srcu lock protecting the memory slots, this should never be a problem, but nevertheless this change is warranted as it shows the intention of the code. Reported-by: James Hogan Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index bda27b6b1aa2..29d0b23af2a9 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1309,7 +1309,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, smp_rmb(); pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); - if (is_error_pfn(pfn)) + if (is_error_noslot_pfn(pfn)) return -EFAULT; if (kvm_is_device_pfn(pfn)) { -- cgit v1.2.2 From 674e70127069f3fd3c58fb0f94c60eb0f6567d78 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Aug 2016 15:03:01 +0100 Subject: arm64: Document workaround for Cortex-A72 erratum #853709 We already have a workaround for Cortex-A57 erratum #852523, but Cortex-A72 r0p0 to r0p2 do suffer from the same issue (known as erratum #853709). Let's document the fact that we already handle this. Acked-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index ae7855f16ec2..5a84b4562603 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -256,7 +256,7 @@ static int __hyp_text __guest_run(struct kvm_vcpu *vcpu) /* * We must restore the 32-bit state before the sysregs, thanks - * to Cortex-A57 erratum #852523. + * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). */ __sysreg32_restore_state(vcpu); __sysreg_restore_guest_state(guest_ctxt); -- cgit v1.2.2 From b63bebe2355cf2632a2979fd2982c88d080c44b6 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 10 Aug 2016 10:49:42 +0100 Subject: arm64: KVM: remove misleading comment on pmu status Comment about how PMU access is handled is not relavant since v4.6 where proper PMU support was added in. Signed-off-by: Vladimir Murzin Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index b0b225ceca18..af5ea86d1c19 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -823,14 +823,6 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 * - * We could trap ID_DFR0 and tell the guest we don't support performance - * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was - * NAKed, so it will read the PMCR anyway. - * - * Therefore we tell the guest we have 0 counters. Unfortunately, we - * must always support PMCCNTR (the cycle counter): we just RAZ/WI for - * all PM registers, which doesn't crash the guest kernel at least. - * * Debug handling: We do trap most, if not all debug related system * registers. The implementation is good enough to ensure that a guest * can use these with minimal performance degradation. The drawback is -- cgit v1.2.2 From f7f6f2d94f0027242ddfd665289b107a873fde43 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 10 Aug 2016 10:49:43 +0100 Subject: arm64: KVM: report configured SRE value to 32-bit world After commit b34f2bc ("arm64: KVM: Make ICC_SRE_EL1 access return the configured SRE value") we report SRE value to 64-bit guest, but 32-bit one still handled as RAZ/WI what leads to funny promise we do not keep: "GICv3: GIC: unable to set SRE (disabled at EL2), panic ahead" Instead, return the actual value of the ICC_SRE_EL1 register that the guest should see. [ Tweaked commit message - Christoffer ] Signed-off-by: Vladimir Murzin Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index af5ea86d1c19..e51367d159d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1352,7 +1352,7 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 }, /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), trap_raz_wi }, + { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, -- cgit v1.2.2 From bfe6c8a89e03f52bccf922f74ced8fe959e7fd36 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 15 Aug 2016 16:33:10 +0100 Subject: arm64: Fix NUMA build error when !CONFIG_ACPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since asm/acpi.h is only included by linux/acpi.h when CONFIG_ACPI is enabled, disabling the latter leads to the following build error on arm64: arch/arm64/mm/numa.c: In function ‘arm64_numa_init’: arch/arm64/mm/numa.c:395:24: error: ‘arm64_acpi_numa_init’ undeclared (first use in this function) if (!acpi_disabled && !numa_init(arm64_acpi_numa_init)) This patch include the asm/acpi.h explicitly in arch/arm64/mm/numa.c for the arm64_acpi_numa_init() definition. Fixes: d8b47fca8c23 ("arm64, ACPI, NUMA: NUMA support based on SRAT and SLIT") Reviewed-by: Hanjun Guo Signed-off-by: Catalin Marinas --- arch/arm64/mm/numa.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index c7fe3ec70774..5bb15eab6f00 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -23,6 +23,8 @@ #include #include +#include + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -- cgit v1.2.2 From bc9f3d7788a88d080a30599bde68f383daf8f8a5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 17 Aug 2016 17:54:41 +0200 Subject: arm64: kernel: avoid literal load of virtual address with MMU off Literal loads of virtual addresses are subject to runtime relocation when CONFIG_RELOCATABLE=y, and given that the relocation routines run with the MMU and caches enabled, literal loads of relocated values performed with the MMU off are not guaranteed to return the latest value unless the memory covering the literal is cleaned to the PoC explicitly. So defer the literal load until after the MMU has been enabled, just like we do for primary_switch() and secondary_switch() in head.S. Fixes: 1e48ef7fcc37 ("arm64: add support for building vmlinux as a relocatable PIE binary") Cc: # 4.6+ Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/kernel/sleep.S | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 9a3aec97ac09..ccf79d849e0a 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -101,12 +101,20 @@ ENTRY(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly /* enable the MMU early - so we can access sleep_save_stash by va */ adr_l lr, __enable_mmu /* __cpu_setup will return here */ - ldr x27, =_cpu_resume /* __enable_mmu will branch here */ + adr_l x27, _resume_switched /* __enable_mmu will branch here */ adrp x25, idmap_pg_dir adrp x26, swapper_pg_dir b __cpu_setup ENDPROC(cpu_resume) + .pushsection ".idmap.text", "ax" +_resume_switched: + ldr x8, =_cpu_resume + br x8 +ENDPROC(_resume_switched) + .ltorg + .popsection + ENTRY(_cpu_resume) mrs x1, mpidr_el1 adrp x8, mpidr_hash -- cgit v1.2.2 From 88b2f634028f1f38dcc3d412e10ff1f224976daa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 17 Aug 2016 13:33:14 +0200 Subject: x86/microcode/AMD: Fix initrd loading with CONFIG_RANDOMIZE_MEMORY=y Similar to: efaad554b4ff ("x86/microcode/intel: Fix initrd loading with CONFIG_RANDOMIZE_MEMORY=y") ... fix microcode loading from the initrd on AMD by adding the randomization offset to the microcode patch container within the initrd. Reported-and-tested-by: Brian Gerst Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-tip-commits@vger.kernel.org Link: http://lkml.kernel.org/r/20160817113314.GA19221@nazgul.tnic Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 27a0228c9cae..b816971f5da4 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -355,6 +355,7 @@ void load_ucode_amd_ap(void) unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; + u8 *cont = container; u32 rev, eax; u16 eq_id; @@ -371,8 +372,11 @@ void load_ucode_amd_ap(void) if (check_current_patch_level(&rev, false)) return; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -434,6 +438,9 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -- cgit v1.2.2 From 7b0501b1e7cddd32b265178e32d332bdfbb532d4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Aug 2016 12:17:00 +0200 Subject: x86/smp: Fix __max_logical_packages value setup Frank reported kernel panic when he disabled several cores in BIOS via following option: Core Disable Bitmap(Hex) [0] with number 0xFFE, which leaves 16 CPUs in system (out of 48). The kernel panic below goes along with following messages: smpboot: Max logical packages: 2^M smpboot: APIC(0) Converting physical 0 to logical package 0^M smpboot: APIC(20) Converting physical 1 to logical package 1^M smpboot: APIC(40) Package 2 exceeds logical package map^M smpboot: CPU 8 APICId 40 disabled^M smpboot: APIC(60) Package 3 exceeds logical package map^M smpboot: CPU 12 APICId 60 disabled^M ... general protection fault: 0000 [#1] SMP^M Modules linked in:^M CPU: 15 PID: 1 Comm: swapper/0 Not tainted 4.7.0-rc5+ #1^M Hardware name: SGI UV300/UV300, BIOS SGI UV 300 series BIOS 05/25/2016^M task: ffff8801673e0000 ti: ffff8801673ac000 task.ti: ffff8801673ac000^M RIP: 0010:[] [] uncore_change_context+0xd4/0x180^M ... [] uncore_event_init_cpu+0x6c/0x70^M [] intel_uncore_init+0x1c2/0x2dd^M [] ? uncore_cpu_setup+0x17/0x17^M [] do_one_initcall+0x50/0x190^M [] ? parse_args+0x293/0x480^M [] kernel_init_freeable+0x1a5/0x249^M [] ? set_debug_rodata+0x12/0x12^M [] kernel_init+0xe/0x110^M [] ret_from_fork+0x1f/0x40^M [] ? rest_init+0x80/0x80^M The reason for the panic is wrong value of __max_logical_packages, which lets logical_package_map uninitialized and the uncore code relying on this map being properly initialized (maybe we should add some safety checks there as well). The __max_logical_packages is computed as: DIV_ROUND_UP(total_cpus, ncpus); - ncpus being number of cores With above BIOS setup we get total_cpus == 16 which set __max_logical_packages to 2 (ncpus is 12). Once topology_update_package_map processes CPU with logical pkg over 2 we display above messages and fail to initialize the physical_to_logical_pkg map, which makes the uncore code crash. The fix is to remove logical_package_map bitmap completely and keep and update the logical_packages number instead. After we enumerate all the present CPUs, we check if the enumerated logical packages count is within its computed maximum from BIOS data. If it's not the case, we set this maximum to the new enumerated value and freeze any new addition of logical packages. The freeze is because lot of init code like uncore/rapl/cqm depends on having maximum logical package value set to allocate their data, so we can't change it later on. Prarit Bhargava tested the patch and confirms that it solves the problem: From dmidecode: Core Count: 24 Core Enabled: 24 Thread Count: 48 Orig kernel boot log: [ 0.464981] smpboot: Max logical packages: 19 [ 0.469861] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.477261] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.484760] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.492258] smpboot: APIC(c0) Converting physical 3 to logical package 3 1. nr_cpus=8, should stop enumerating in package 0: [ 0.533664] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.539596] smpboot: Max logical packages: 19 2. max_cpus=8, should still enumerate all packages: [ 0.526494] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.532428] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.538456] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.544486] smpboot: APIC(c0) Converting physical 3 to logical package 3 [ 0.550524] smpboot: Max logical packages: 19 3. nr_cpus=49 ( 2 socket + 1 core on 3rd socket), should stop enumerating in package 2: [ 0.521378] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.527314] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.533345] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.539368] smpboot: Max logical packages: 19 4. maxcpus=49, should still enumerate all packages: [ 0.525591] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.531525] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.537547] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.543579] smpboot: APIC(c0) Converting physical 3 to logical package 3 [ 0.549624] smpboot: Max logical packages: 19 5. kdump (nr_cpus=1) works as well. Reported-by: Frank Ramsay Tested-by: Prarit Bhargava Signed-off-by: Jiri Olsa Reviewed-by: Prarit Bhargava Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160815101700.GA30090@krava Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a6e84a30a54..4296beb8fdd3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -100,10 +100,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. We might want to allocate that dynamically */ static int *physical_to_logical_pkg __read_mostly; static unsigned long *physical_package_map __read_mostly;; -static unsigned long *logical_package_map __read_mostly; static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; @@ -277,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - new = find_first_zero_bit(logical_package_map, __max_logical_packages); - if (new >= __max_logical_packages) { + if (logical_packages_frozen) { physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package map\n", + pr_warn("APIC(%x) Package %u exceeds logical package max\n", apicid, pkg); return -ENOSPC; } - set_bit(new, logical_package_map); + + new = logical_packages++; pr_info("APIC(%x) Converting physical %u to logical package %u\n", apicid, pkg, new); physical_to_logical_pkg[pkg] = new; @@ -341,6 +342,7 @@ static void __init smp_init_package_map(void) } __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + logical_packages = 0; /* * Possibly larger than what we need as the number of apic ids per @@ -352,10 +354,6 @@ static void __init smp_init_package_map(void) memset(physical_to_logical_pkg, 0xff, size); size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); - logical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); for_each_present_cpu(cpu) { unsigned int apicid = apic->cpu_present_to_apicid(cpu); @@ -369,6 +367,15 @@ static void __init smp_init_package_map(void) set_cpu_possible(cpu, false); set_cpu_present(cpu, false); } + + if (logical_packages > __max_logical_packages) { + pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", + logical_packages, __max_logical_packages); + logical_packages_frozen = true; + __max_logical_packages = logical_packages; + } + + pr_info("Max logical packages: %u\n", __max_logical_packages); } void __init smp_store_boot_cpu_info(void) -- cgit v1.2.2 From d048c098218e91ed0e10dfa1f0f80e2567fe4ef7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Mon, 8 Aug 2016 20:16:22 +0200 Subject: KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit msr bitmap can be used to avoid a VM exit (interception) on guest MSR accesses. In some configurations of VMX controls, the guest can even directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED APIC ACCESSES. L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. To do so, L1 would first trick KVM to disable all possible interceptions by enabling APICv features and then would turn those features off; nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would not intercept previously enabled MSRs even though they were not safe with the new configuration. Correctly re-enabling interceptions is not enough as a second bug would still allow L1+L2 to access host's MSRs: msr bitmap was shared for all VMCSs, so L1 could trigger a race to get the desired combination of msr bitmap and VMX controls. This fix allocates a msr bitmap for every L1 VCPU, allows only safe x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would have to intercept everything anyway. Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") Reported-by: Jim Mattson Suggested-by: Wincy Van Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 63 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a45d8580f91e..c66ac2c70d22 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -435,6 +435,8 @@ struct nested_vmx { bool pi_pending; u16 posted_intr_nv; + unsigned long *msr_bitmap; + struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) unsigned long *msr_bitmap; if (is_guest_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_nested; + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_longmode_x2apic) goto out4; - if (nested) { - vmx_msr_bitmap_nested = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_nested) - goto out5; - } - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out6; @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (nested) - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; @@ -6529,9 +6521,6 @@ out8: out7: free_page((unsigned long)vmx_vmread_bitmap); out6: - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); -out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_vmcs12) - return -ENOMEM; + goto out_cached_vmcs12; if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) { - kfree(vmx->nested.cached_vmcs12); - return -ENOMEM; - } + if (!shadow_vmcs) + goto out_shadow_vmcs; /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); return 1; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; } /* @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); + if (vmx->nested.msr_bitmap) { + free_page((unsigned long)vmx->nested.msr_bitmap); + vmx->nested.msr_bitmap = NULL; + } if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); kfree(vmx->nested.cached_vmcs12); @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, { int msr; struct page *page; - unsigned long *msr_bitmap; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, WARN_ON(1); return false; } - msr_bitmap = (unsigned long *)kmap(page); - if (!msr_bitmap) { + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (!msr_bitmap_l1) { nested_release_page_clean(page); WARN_ON(1); return false; } + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) for (msr = 0x800; msr <= 0x8ff; msr++) nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, msr, MSR_TYPE_R); - /* TPR is allowed */ - nested_vmx_disable_intercept_for_msr(msr_bitmap, - vmx_msr_bitmap_nested, + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { - /* EOI and self-IPI are allowed */ nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_EOI >> 4), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_SELF_IPI >> 4), MSR_TYPE_W); } - } else { - /* - * Enable reading intercept of all the x2apic - * MSRs. We should not rely on vmcs12 to do any - * optimizations here, it may have been modified - * by L1. - */ - for (msr = 0x800; msr <= 0x8ff; msr++) - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - msr, - MSR_TYPE_R); - - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); } kunmap(page); nested_release_page_clean(page); @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS) { - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); - /* MSR_BITMAP will be set by following vmx_set_efer. */ - } else + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ + else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; /* -- cgit v1.2.2 From dccbfcf52cebb8963246eba5b177b77f26b34da0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Mon, 8 Aug 2016 20:16:23 +0200 Subject: KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the write with vmcs02 as the current VMCS. This will incorrectly apply modifications intended for vmcs01 to vmcs02 and L2 can use it to gain access to L0's x2APIC registers by disabling virtualized x2APIC while using msr bitmap that assumes enabled. Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the current VMCS. An alternative solution would temporarily make vmcs01 the current VMCS, but it requires more care. Fixes: 8d14695f9542 ("x86, apicv: add virtual x2apic support") Reported-by: Jim Mattson Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c66ac2c70d22..ae111a07acc4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -422,6 +422,7 @@ struct nested_vmx { struct list_head vmcs02_pool; int vmcs02_num; u64 vmcs01_tsc_offset; + bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; /* @@ -8424,6 +8425,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) { u32 sec_exec_control; + /* Postpone execution until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true; + return; + } + /* * There is not point to enable virtualize x2apic without enable * apicv @@ -10749,6 +10756,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); + if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { + vmx->nested.change_vmcs01_virtual_x2apic_mode = false; + vmx_set_virtual_x2apic_mode(vcpu, + vcpu->arch.apic_base & X2APIC_ENABLE); + } + /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; -- cgit v1.2.2 From c95ba92afb238ac565c68968fc72e38ca8d1b6e8 Mon Sep 17 00:00:00 2001 From: Peter Feiner Date: Wed, 17 Aug 2016 09:36:47 -0700 Subject: kvm: nVMX: fix nested tsc scaling When the host supported TSC scaling, L2 would use a TSC multiplier of 0, which causes a VM entry failure. Now L2's TSC uses the same multiplier as L1. Signed-off-by: Peter Feiner Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ae111a07acc4..5cede40e2552 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2200,6 +2200,12 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); } +static void decache_tsc_multiplier(struct vcpu_vmx *vmx) +{ + vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -2258,10 +2264,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Setup TSC multiplier */ if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { - vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); - } + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) + decache_tsc_multiplier(vmx); vmx_vcpu_pi_load(vcpu, cpu); vmx->host_pkru = read_pkru(); @@ -9999,6 +10003,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); else vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); if (enable_vpid) { /* @@ -10755,6 +10761,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, else vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { vmx->nested.change_vmcs01_virtual_x2apic_mode = false; -- cgit v1.2.2 From a93a4d62324123dcda383bdb4ab89151bbc0e499 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 5 Dec 2014 12:34:54 +0000 Subject: arm64: Fix shift warning in arch/arm64/mm/dump.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with 48-bit VAs and 16K page configuration, it's possible to get the following warning when building the arm64 page table dumping code: arch/arm64/mm/dump.c: In function ‘walk_pud’: arch/arm64/mm/dump.c:274:102: warning: right shift count >= width of type [-Wshift-count-overflow] This is because pud_offset(pgd, 0) performs a shift to the right by 36 while the value 0 has the type 'int' by default, therefore 32-bit. This patch modifies all the p*_offset() uses in arch/arm64/mm/dump.c to use 0UL for the address argument. Acked-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/mm/dump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index f94b80eb295d..9c3e75df2180 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -242,7 +242,7 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) { - pte_t *pte = pte_offset_kernel(pmd, 0); + pte_t *pte = pte_offset_kernel(pmd, 0UL); unsigned long addr; unsigned i; @@ -254,7 +254,7 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) { - pmd_t *pmd = pmd_offset(pud, 0); + pmd_t *pmd = pmd_offset(pud, 0UL); unsigned long addr; unsigned i; @@ -271,7 +271,7 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0); + pud_t *pud = pud_offset(pgd, 0UL); unsigned long addr; unsigned i; -- cgit v1.2.2 From ba913e4f72fc9cfd03dad968dfb110eb49211d80 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 19 Aug 2016 14:30:29 +0100 Subject: MIPS: KVM: Check for pfn noslot case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mapping a page into the guest we error check using is_error_pfn(), however this doesn't detect a value of KVM_PFN_NOSLOT, indicating an error HVA for the page. This can only happen on MIPS right now due to unusual memslot management (e.g. being moved / removed / resized), or with an Enhanced Virtual Memory (EVA) configuration where the default KVM_HVA_ERR_* and kvm_is_error_hva() definitions are unsuitable (fixed in a later patch). This case will be treated as a pfn of zero, mapping the first page of physical memory into the guest. It would appear the MIPS KVM port wasn't updated prior to being merged (in v3.10) to take commit 81c52c56e2b4 ("KVM: do not treat noslot pfn as a error pfn") into account (merged v3.8), which converted a bunch of is_error_pfn() calls to is_error_noslot_pfn(). Switch to using is_error_noslot_pfn() instead to catch this case properly. Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: # 3.10.y- Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6cfdcf55572d..121008c0fcc9 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -40,7 +40,7 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) srcu_idx = srcu_read_lock(&kvm->srcu); pfn = gfn_to_pfn(kvm, gfn); - if (is_error_pfn(pfn)) { + if (is_error_noslot_pfn(pfn)) { kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); err = -EFAULT; goto out; -- cgit v1.2.2 From 86147e3cfa5e118b61e78f4f0bf29e920dcbd477 Mon Sep 17 00:00:00 2001 From: Liav Rehana Date: Tue, 16 Aug 2016 10:55:35 +0300 Subject: ARC: use correct offset in pt_regs for saving/restoring user mode r25 User mode callee regs are explicitly collected before signal delivery or breakpoint trap. r25 is special for kernel as it serves as task pointer, so user mode value is clobbered very early. It is saved in pt_regs where generally only scratch (aka caller saved) regs are saved. The code to access the corresponding pt_regs location had a subtle bug as it was using load/store with scaling of offset, whereas the offset was already byte wise correct. So fix this by replacing LD.AS with a standard LD Cc: Signed-off-by: Liav Rehana Reviewed-by: Alexey Brodkin [vgupta: rewrote title and commit log] Signed-off-by: Vineet Gupta --- arch/arc/include/asm/entry.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/asm/entry.h b/arch/arc/include/asm/entry.h index ad7860c5ce15..51597f344a62 100644 --- a/arch/arc/include/asm/entry.h +++ b/arch/arc/include/asm/entry.h @@ -142,7 +142,7 @@ #ifdef CONFIG_ARC_CURR_IN_REG ; Retrieve orig r25 and save it with rest of callee_regs - ld.as r12, [r12, PT_user_r25] + ld r12, [r12, PT_user_r25] PUSH r12 #else PUSH r25 @@ -198,7 +198,7 @@ ; SP is back to start of pt_regs #ifdef CONFIG_ARC_CURR_IN_REG - st.as r12, [sp, PT_user_r25] + st r12, [sp, PT_user_r25] #endif .endm -- cgit v1.2.2 From 840c054fd0efb048df6fceb0c46385ec5b66dfe6 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 10 Aug 2016 14:10:57 -0700 Subject: ARC: Support syscall ABI v4 The syscall ABI includes the gcc functional calling ABI since a syscall implies userland caller and kernel callee. The current gcc ABI (v3) for ARCv2 ISA required 64-bit data be passed in even-odd register pairs, (potentially punching reg holes when passing such values as args). This was partly driven by the fact that the double-word LDD/STD instructions in ARCv2 expect the register alignment and thus gcc forcing this avoids extra MOV at the cost of a few unused register (which we have plenty anyways). This however was rejected as part of upstreaming gcc port to HS. So the new ABI v4 doesn't enforce the even-odd reg restriction. Do note that for ARCompact ISA builds v3 and v4 are practically the same in terms of gcc code generation. In terms of change management, we infer the new ABI if gcc 6.x onwards is used for building the kernel. This also needs a stable backport to enable older kernels to work with new tools/user-space Cc: Signed-off-by: Vineet Gupta --- arch/arc/include/uapi/asm/elf.h | 11 +++++++++-- arch/arc/kernel/process.c | 2 +- arch/arc/kernel/setup.c | 6 ++++-- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/uapi/asm/elf.h b/arch/arc/include/uapi/asm/elf.h index 0f99ac8fcbb2..0037a587320d 100644 --- a/arch/arc/include/uapi/asm/elf.h +++ b/arch/arc/include/uapi/asm/elf.h @@ -13,8 +13,15 @@ /* Machine specific ELF Hdr flags */ #define EF_ARC_OSABI_MSK 0x00000f00 -#define EF_ARC_OSABI_ORIG 0x00000000 /* MUST be zero for back-compat */ -#define EF_ARC_OSABI_CURRENT 0x00000300 /* v3 (no legacy syscalls) */ + +#define EF_ARC_OSABI_V3 0x00000300 /* v3 (no legacy syscalls) */ +#define EF_ARC_OSABI_V4 0x00000400 /* v4 (64bit data any reg align) */ + +#if __GNUC__ < 6 +#define EF_ARC_OSABI_CURRENT EF_ARC_OSABI_V3 +#else +#define EF_ARC_OSABI_CURRENT EF_ARC_OSABI_V4 +#endif typedef unsigned long elf_greg_t; typedef unsigned long elf_fpregset_t; diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index b5db9e7fd649..be1972bd2729 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -199,7 +199,7 @@ int elf_check_arch(const struct elf32_hdr *x) } eflags = x->e_flags; - if ((eflags & EF_ARC_OSABI_MSK) < EF_ARC_OSABI_CURRENT) { + if ((eflags & EF_ARC_OSABI_MSK) != EF_ARC_OSABI_CURRENT) { pr_err("ABI mismatch - you need newer toolchain\n"); force_sigsegv(SIGSEGV, current); return 0; diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index a946400a86d0..f52a0d0dc462 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -291,8 +291,10 @@ static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) cpu->dccm.base_addr, TO_KB(cpu->dccm.sz), cpu->iccm.base_addr, TO_KB(cpu->iccm.sz)); - n += scnprintf(buf + n, len - n, - "OS ABI [v3]\t: no-legacy-syscalls\n"); + n += scnprintf(buf + n, len - n, "OS ABI [v%d]\t: %s\n", + EF_ARC_OSABI_CURRENT >> 8, + EF_ARC_OSABI_CURRENT == EF_ARC_OSABI_V3 ? + "no-legacy-syscalls" : "64-bit data any register aligned"); return buf; } -- cgit v1.2.2 From d77976c414ed7f521b9c79b2a9dde0147a3cf754 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 17 Aug 2016 17:34:46 -0700 Subject: ARC: export kmap | MODPOST 7 modules | ERROR: "kmap" [fs/ext2/ext2.ko] undefined! | ../scripts/Makefile.modpost:91: recipe for target '__modpost' failed Cc: Signed-off-by: Vineet Gupta --- arch/arc/mm/highmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 04f83322c9fd..77ff64a874a1 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -61,6 +61,7 @@ void *kmap(struct page *page) return kmap_high(page); } +EXPORT_SYMBOL(kmap); void *kmap_atomic(struct page *page) { -- cgit v1.2.2 From 1c3c909303924d30145601f47b6c058fdd2cbc2e Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 16 Aug 2016 18:27:07 -0700 Subject: ARC: mm: fix build breakage with STRICT_MM_TYPECHECKS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | CC mm/memory.o | In file included from ../mm/memory.c:53:0: | ../include/linux/pfn_t.h: In function ‘pfn_t_pte’: | ../include/linux/pfn_t.h:78:2: error: conversion to non-scalar type requested | return pfn_pte(pfn_t_to_pfn(pfn), pgprot); With STRICT_MM_TYPECHECKS pte_t is a struct and the offending code forces a cast which ends up shifting a struct and hence the gcc warning. Note that in recent past some of the arches (aarch64, s390) made STRICT_MM_TYPECHECKS default, but we don't for ARC as this leads to slightly worse generated code, given ARC ABI definition of returning structs (which pte_t would become) Quoting from ARC ABI... "Results of type struct are returned in a caller-supplied temporary variable whose address is passed in r0. For such functions, the arguments are shifted so that they are passed in r1 and up." So - struct to be returned would be allocated on stack requiring extra code at call sites - callee updates stack memory to facilitate the return (vs. simple MOV into return reg r0) Hence STRICT_MM_TYPECHECKS is not enabled by default for ARC Cc: #4.4+ Signed-off-by: Vineet Gupta --- arch/arc/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 0f92d97432a2..89eeb3720051 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -280,7 +280,7 @@ static inline void pmd_set(pmd_t *pmdp, pte_t *ptep) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) -#define pfn_pte(pfn, prot) (__pte(((pte_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) +#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) /* Don't use virt_to_pfn for macros below: could cause truncations for PAE40*/ #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) -- cgit v1.2.2 From c57653dc94d0db7bf63067433ceaa97bdcd0a312 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 19 Aug 2016 13:59:02 -0700 Subject: ARC: export __udivdi3 for modules Some module using div_u64() was failing to link because the libgcc 64-bit divide assist routine was not being exported for modules Reported-by: avinashp@quantenna.com Cc: stable@vger.kernel.org Signed-off-by: Vineet Gupta --- arch/arc/kernel/arcksyms.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arc/kernel/arcksyms.c b/arch/arc/kernel/arcksyms.c index 4d9e77724bed..000dd041ab42 100644 --- a/arch/arc/kernel/arcksyms.c +++ b/arch/arc/kernel/arcksyms.c @@ -28,6 +28,7 @@ extern void __muldf3(void); extern void __divdf3(void); extern void __floatunsidf(void); extern void __floatunsisf(void); +extern void __udivdi3(void); EXPORT_SYMBOL(__ashldi3); EXPORT_SYMBOL(__ashrdi3); @@ -45,6 +46,7 @@ EXPORT_SYMBOL(__muldf3); EXPORT_SYMBOL(__divdf3); EXPORT_SYMBOL(__floatunsidf); EXPORT_SYMBOL(__floatunsisf); +EXPORT_SYMBOL(__udivdi3); /* ARC optimised assembler routines */ EXPORT_SYMBOL(memset); -- cgit v1.2.2 From ae141830b118c3fb5b7eab6fa7c8ab7b7224b0a4 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 19 Aug 2016 22:39:02 +0200 Subject: parisc: Fix automatic selection of cr16 clocksource Commit 54b66800907 (parisc: Add native high-resolution sched_clock() implementation) added support to use the CPU-internal cr16 counters as reliable clocksource with the help of HAVE_UNSTABLE_SCHED_CLOCK. Sadly the commit missed to remove the hack which prevented cr16 to become the default clocksource even on SMP systems. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # 4.7+ --- arch/parisc/kernel/processor.c | 8 -------- arch/parisc/kernel/time.c | 12 ------------ 2 files changed, 20 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 5adc339eb7c8..0c2a94a0f751 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -51,8 +51,6 @@ EXPORT_SYMBOL(_parisc_requires_coherency); DEFINE_PER_CPU(struct cpuinfo_parisc, cpu_data); -extern int update_cr16_clocksource(void); /* from time.c */ - /* ** PARISC CPU driver - claim "device" and initialize CPU data structures. ** @@ -228,12 +226,6 @@ static int processor_probe(struct parisc_device *dev) } #endif - /* If we've registered more than one cpu, - * we'll use the jiffies clocksource since cr16 - * is not synchronized between CPUs. - */ - update_cr16_clocksource(); - return 0; } diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 505cf1ac5af2..4b0b963d52a7 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -221,18 +221,6 @@ static struct clocksource clocksource_cr16 = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -int update_cr16_clocksource(void) -{ - /* since the cr16 cycle counters are not synchronized across CPUs, - we'll check if we should switch to a safe clocksource: */ - if (clocksource_cr16.rating != 0 && num_online_cpus() > 1) { - clocksource_change_rating(&clocksource_cr16, 0); - return 1; - } - - return 0; -} - void __init start_cpu_itimer(void) { unsigned int cpu = smp_processor_id(); -- cgit v1.2.2 From 3eb53b20d7bd1374598cfb1feaa081fcac0e76cd Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 20 Aug 2016 11:51:38 +0200 Subject: parisc: Fix order of EREFUSED define in errno.h When building gccgo in userspace, errno.h gets parsed and the go include file sysinfo.go is generated. Since EREFUSED is defined to the same value as ECONNREFUSED, and ECONNREFUSED is defined later on in errno.h, this leads to go complaining that EREFUSED isn't defined yet. Fix this trivial problem by moving the define of EREFUSED down after ECONNREFUSED in errno.h (and clean up the indenting while touching this line). Signed-off-by: Helge Deller Cc: stable@vger.kernel.org --- arch/parisc/include/uapi/asm/errno.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h index c0ae62520d15..274d5bc6ecce 100644 --- a/arch/parisc/include/uapi/asm/errno.h +++ b/arch/parisc/include/uapi/asm/errno.h @@ -97,10 +97,10 @@ #define ENOTCONN 235 /* Transport endpoint is not connected */ #define ESHUTDOWN 236 /* Cannot send after transport endpoint shutdown */ #define ETOOMANYREFS 237 /* Too many references: cannot splice */ -#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */ #define ETIMEDOUT 238 /* Connection timed out */ #define ECONNREFUSED 239 /* Connection refused */ -#define EREMOTERELEASE 240 /* Remote peer released connection */ +#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */ +#define EREMOTERELEASE 240 /* Remote peer released connection */ #define EHOSTDOWN 241 /* Host is down */ #define EHOSTUNREACH 242 /* No route to host */ -- cgit v1.2.2 From dad2232844073295c64e9cc2d734a0ade043e0f6 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 17 Aug 2016 18:10:11 +0300 Subject: um: Don't discard .text.exit section Commit e41f501d3912 ("vmlinux.lds: account for destructor sections") added '.text.exit' to EXIT_TEXT which is discarded at link time by default. This breaks compilation of UML: `.text.exit' referenced in section `.fini_array' of /usr/lib/gcc/x86_64-linux-gnu/6/../../../x86_64-linux-gnu/libc.a(sdlerror.o): defined in discarded section `.text.exit' of /usr/lib/gcc/x86_64-linux-gnu/6/../../../x86_64-linux-gnu/libc.a(sdlerror.o) Apparently UML doesn't want to discard exit text, so let's place all EXIT_TEXT sections in .exit.text. Fixes: e41f501d3912 ("vmlinux.lds: account for destructor sections") Reported-by: Stefan Traby Signed-off-by: Andrey Ryabinin Cc: Acked-by: Dmitry Vyukov Signed-off-by: Richard Weinberger --- arch/um/include/asm/common.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index 1dd5bd8a8c59..133055311dce 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -81,7 +81,7 @@ .altinstr_replacement : { *(.altinstr_replacement) } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } + .exit.text : { EXIT_TEXT } .exit.data : { *(.exit.data) } .preinit_array : { -- cgit v1.2.2 From 21c80c9fefc3db10b530a96eb0478c29eb28bf77 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 23 Aug 2016 16:36:42 -0500 Subject: x86/PCI: VMD: Fix infinite loop executing irq's We can't initialize the list head on deletion as this causes the node to point to itself, which causes an infinite loop if vmd_irq() happens to be servicing that node. The list initialization was trying to fix a bug from multiple calls to disable the same IRQ. Fix this instead by having the VMD driver track if the interrupt is enabled. [bhelgaas: changelog, add "Fixes"] Fixes: 97e923063575 ("x86/PCI: VMD: Initialize list item in IRQ disable") Reported-by: Grzegorz Koczot Tested-by: Miroslaw Drost Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Acked-by Jon Derrick: --- arch/x86/pci/vmd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index b814ca675131..7948be342ee9 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -41,6 +41,7 @@ static DEFINE_RAW_SPINLOCK(list_lock); * @node: list item for parent traversal. * @rcu: RCU callback item for freeing. * @irq: back pointer to parent. + * @enabled: true if driver enabled IRQ * @virq: the virtual IRQ value provided to the requesting driver. * * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to @@ -50,6 +51,7 @@ struct vmd_irq { struct list_head node; struct rcu_head rcu; struct vmd_irq_list *irq; + bool enabled; unsigned int virq; }; @@ -122,7 +124,9 @@ static void vmd_irq_enable(struct irq_data *data) unsigned long flags; raw_spin_lock_irqsave(&list_lock, flags); + WARN_ON(vmdirq->enabled); list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list); + vmdirq->enabled = true; raw_spin_unlock_irqrestore(&list_lock, flags); data->chip->irq_unmask(data); @@ -136,8 +140,10 @@ static void vmd_irq_disable(struct irq_data *data) data->chip->irq_mask(data); raw_spin_lock_irqsave(&list_lock, flags); - list_del_rcu(&vmdirq->node); - INIT_LIST_HEAD_RCU(&vmdirq->node); + if (vmdirq->enabled) { + list_del_rcu(&vmdirq->node); + vmdirq->enabled = false; + } raw_spin_unlock_irqrestore(&list_lock, flags); } -- cgit v1.2.2 From 2e63ad4bd5dd583871e6602f9d398b9322d358d9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 23 Aug 2016 20:07:19 +0800 Subject: x86/apic: Do not init irq remapping if ioapic is disabled native_smp_prepare_cpus -> default_setup_apic_routing -> enable_IR_x2apic -> irq_remapping_prepare -> intel_prepare_irq_remapping -> intel_setup_irq_remapping So IR table is setup even if "noapic" boot parameter is added. As a result we crash later when the interrupt affinity is set due to a half initialized remapping infrastructure. Prevent remap initialization when IOAPIC is disabled. Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1471954039-3942-1-git-send-email-wanpeng.li@hotmail.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index cea4fc19e844..50c95af0f017 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1623,6 +1623,9 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; + if (skip_ioapic_setup) + return; + ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) return; -- cgit v1.2.2 From 55467dea2967259f21f4f854fc99d39cc5fea60e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 29 Jul 2016 11:06:48 +0200 Subject: xen: change the type of xen_vcpu_id to uint32_t We pass xen_vcpu_id mapping information to hypercalls which require uint32_t type so it would be cleaner to have it as uint32_t. The initializer to -1 can be dropped as we always do the mapping before using it and we never check the 'not set' value anyway. Signed-off-by: Vitaly Kuznetsov Signed-off-by: David Vrabel --- arch/arm/xen/enlighten.c | 2 +- arch/x86/xen/enlighten.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index b0b82f5ea338..3d2cef6488ea 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -50,7 +50,7 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); static struct vcpu_info __percpu *xen_vcpu_info; /* Linux <-> Xen vCPU id mapping */ -DEFINE_PER_CPU(int, xen_vcpu_id) = -1; +DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); /* These are unused until we support booting "pre-ballooned" */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8ffb089b19a5..b86ebb1a9a7f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -118,7 +118,7 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ -DEFINE_PER_CPU(int, xen_vcpu_id) = -1; +DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); enum xen_domain_type xen_domain_type = XEN_NATIVE; -- cgit v1.2.2 From fd363bd417ddb6103564c69cfcbd92d9a7877431 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 24 Aug 2016 18:02:08 +0100 Subject: arm64: avoid TLB conflict with CONFIG_RANDOMIZE_BASE When CONFIG_RANDOMIZE_BASE is selected, we modify the page tables to remap the kernel at a newly-chosen VA range. We do this with the MMU disabled, but do not invalidate TLBs prior to re-enabling the MMU with the new tables. Thus the old mappings entries may still live in TLBs, and we risk violating Break-Before-Make requirements, leading to TLB conflicts and/or other issues. We invalidate TLBs when we uninsall the idmap in early setup code, but prior to this we are subject to issues relating to the Break-Before-Make violation. Avoid these issues by invalidating the TLBs before the new mappings can be used by the hardware. Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Cc: # 4.6+ Acked-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b77f58355da1..3e7b050e99dc 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -757,6 +757,9 @@ ENTRY(__enable_mmu) isb bl __create_page_tables // recreate kernel mapping + tlbi vmalle1 // Remove any stale TLB entries + dsb nsh + msr sctlr_el1, x19 // re-enable the MMU isb ic iallu // flush instructions fetched -- cgit v1.2.2 From a5ff1b34e16c203397542d98c49c5c7783193946 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Aug 2016 15:17:02 -0700 Subject: treewide: replace config_enabled() with IS_ENABLED() (2nd round) Commit 97f2645f358b ("tree-wide: replace config_enabled() with IS_ENABLED()") mostly killed config_enabled(), but some new users have appeared for v4.8-rc1. They are all used for a boolean option, so can be replaced with IS_ENABLED() safely. Link: http://lkml.kernel.org/r/1471970749-24867-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Kees Cook Acked-by: Peter Oberparleiter Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ralf Baechle Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/page.h | 4 ++-- arch/s390/kernel/setup.c | 6 ++---- arch/x86/mm/kaslr.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index ea0cd9773914..5f987598054f 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -164,7 +164,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; */ static inline unsigned long ___pa(unsigned long x) { - if (config_enabled(CONFIG_64BIT)) { + if (IS_ENABLED(CONFIG_64BIT)) { /* * For MIPS64 the virtual address may either be in one of * the compatibility segements ckseg0 or ckseg1, or it may @@ -173,7 +173,7 @@ static inline unsigned long ___pa(unsigned long x) return x < CKSEG0 ? XPHYSADDR(x) : CPHYSADDR(x); } - if (!config_enabled(CONFIG_EVA)) { + if (!IS_ENABLED(CONFIG_EVA)) { /* * We're using the standard MIPS32 legacy memory map, ie. * the address x is going to be in kseg0 or kseg1. We can diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ba5f456edaa9..7f7ba5f23f13 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -204,11 +204,9 @@ static void __init conmode_default(void) #endif } } else if (MACHINE_IS_KVM) { - if (sclp.has_vt220 && - config_enabled(CONFIG_SCLP_VT220_CONSOLE)) + if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) SET_CONSOLE_VT220; - else if (sclp.has_linemode && - config_enabled(CONFIG_SCLP_CONSOLE)) + else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) SET_CONSOLE_SCLP; else SET_CONSOLE_HVC; diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ec8654f117d8..bda8d5eef04d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -77,7 +77,7 @@ static inline unsigned long get_padding(struct kaslr_memory_region *region) */ static inline bool kaslr_memory_enabled(void) { - return kaslr_enabled() && !config_enabled(CONFIG_KASAN); + return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } /* Initialize base and padding for each memory region randomized with KASLR */ -- cgit v1.2.2