68 files changed, 1638 insertions, 825 deletions
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt new file mode 100644 index 000000000000..607b1a016064 --- /dev/null +++ b/Documentation/x86/earlyprintk.txt | |||
| @@ -0,0 +1,101 @@ | |||
| 1 | |||
| 2 | Mini-HOWTO for using the earlyprintk=dbgp boot option with a | ||
| 3 | USB2 Debug port key and a debug cable, on x86 systems. | ||
| 4 | |||
| 5 | You need two computers, the 'USB debug key' special gadget, | ||
| 6 | and two USB cables, connected like this: | ||
| 7 | |||
| 8 | [host/target] <-------> [USB debug key] <-------> [client/console] | ||
| 9 | |||
| 10 | 1. There are three specific hardware requirements: | ||
| 11 | |||
| 12 | a.) Host/target system needs to have USB debug port capability. | ||
| 13 | |||
| 14 | You can check this capability by looking at a 'Debug port' bit in | ||
| 15 | the lspci -vvv output: | ||
| 16 | |||
| 17 | # lspci -vvv | ||
| 18 | ... | ||
| 19 | 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI]) | ||
| 20 | Subsystem: Lenovo ThinkPad T61 | ||
| 21 | Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx- | ||
| 22 | Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx- | ||
| 23 | Latency: 0 | ||
| 24 | Interrupt: pin D routed to IRQ 19 | ||
| 25 | Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K] | ||
| 26 | Capabilities: [50] Power Management version 2 | ||
| 27 | Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+) | ||
| 28 | Status: D0 PME-Enable- DSel=0 DScale=0 PME+ | ||
| 29 | Capabilities: [58] Debug port: BAR=1 offset=00a0 | ||
| 30 | ^^^^^^^^^^^ <==================== [ HERE ] | ||
| 31 | Kernel driver in use: ehci_hcd | ||
| 32 | Kernel modules: ehci-hcd | ||
| 33 | ... | ||
| 34 | |||
| 35 | ( If your system does not list a debug port capability then you probably | ||
| 36 | won't be able to use the USB debug key; see the quick check after this list. ) | ||
| 37 | |||
| 38 | b.) You also need a Netchip USB debug cable/key: | ||
| 39 | |||
| 40 | http://www.plxtech.com/products/NET2000/NET20DC/default.asp | ||
| 41 | |||
| 42 | This is a small blue plastic connector with two USB connections; | ||
| 43 | it draws its power from those connections. | ||
| 44 | |||
| 45 | c.) You need a second client/console system with a regular USB port. | ||
| 46 | |||
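A quick way to check requirement (a) without reading through the full
lspci output (standard lspci/grep usage, nothing beyond what is shown
above):

	# lspci -vvv | grep -i 'debug port'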
| 47 | 2. Software requirements: | ||
| 48 | |||
| 49 | a.) On the host/target system: | ||
| 50 | |||
| 51 | You need to enable the following kernel config option: | ||
| 52 | |||
| 53 | CONFIG_EARLY_PRINTK_DBGP=y | ||
| 54 | |||
| 55 | And you need to add the boot command line: "earlyprintk=dbgp". | ||
| 56 | (If you are using Grub, append it to the 'kernel' line in | ||
| 57 | /etc/grub.conf) | ||
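For example, the resulting 'kernel' line might then look like this
(the kernel image path and root device are illustrative; adjust them
for your setup):

	kernel /boot/vmlinuz-2.6.29-rc7 ro root=/dev/sda1 earlyprintk=dbgp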
| 58 | |||
| 59 | NOTE: normally the earlyprintk console gets turned off once the | ||
| 60 | regular console is alive - use "earlyprintk=dbgp,keep" to keep | ||
| 61 | this channel open beyond early bootup. This can be useful for | ||
| 62 | debugging crashes under Xorg, etc. | ||
| 63 | |||
| 64 | b.) On the client/console system: | ||
| 65 | |||
| 66 | You should enable the following kernel config option: | ||
| 67 | |||
| 68 | CONFIG_USB_SERIAL_DEBUG=y | ||
| 69 | |||
| 70 | On the next bootup with the modified kernel you should | ||
| 71 | get one or more /dev/ttyUSBx devices. | ||
| 72 | |||
| 73 | Now this channel of kernel messages is ready to be used: start | ||
| 74 | your favorite terminal emulator (minicom, etc.) and set | ||
| 75 | it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to | ||
| 76 | see the raw output. | ||
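For example, with minicom (assuming the debug key enumerated as
/dev/ttyUSB0):

	minicom -D /dev/ttyUSB0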
| 77 | |||
| 78 | c.) On Nvidia Southbridge based systems: the kernel will try to probe | ||
| 79 | and find out which port has the debug device connected. | ||
| 80 | |||
| 81 | 3. Testing that it works fine: | ||
| 82 | |||
| 83 | You can test the output by using earlyprintk=dbgp,keep and provoking | ||
| 84 | kernel messages on the host/target system. You can provoke a harmless | ||
| 85 | kernel message by, for example, doing: | ||
| 86 | |||
| 87 | echo h > /proc/sysrq-trigger | ||
| 88 | |||
| 89 | On the host/target system you should see this help line in "dmesg" output: | ||
| 90 | |||
| 91 | SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z) | ||
| 92 | |||
| 93 | On the client/console system do: | ||
| 94 | |||
| 95 | cat /dev/ttyUSB0 | ||
| 96 | |||
| 97 | And you should see the help line above displayed shortly after you've | ||
| 98 | provoked it on the host system. | ||
| 99 | |||
| 100 | If it does not work, please ask about it on the linux-kernel@vger.kernel.org | ||
| 101 | mailing list or contact the x86 maintainers. | ||
| @@ -1,7 +1,7 @@ | |||
| 1 | VERSION = 2 | 1 | VERSION = 2 |
| 2 | PATCHLEVEL = 6 | 2 | PATCHLEVEL = 6 |
| 3 | SUBLEVEL = 29 | 3 | SUBLEVEL = 29 |
| 4 | EXTRAVERSION = -rc6 | 4 | EXTRAVERSION = -rc7 |
| 5 | NAME = Erotic Pickled Herring | 5 | NAME = Erotic Pickled Herring |
| 6 | 6 | ||
| 7 | # *DOCUMENTATION* | 7 | # *DOCUMENTATION* |
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 7049815d66d5..68d6494c0389 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c | |||
| @@ -233,12 +233,13 @@ static void __init cacheid_init(void) | |||
| 233 | unsigned int cachetype = read_cpuid_cachetype(); | 233 | unsigned int cachetype = read_cpuid_cachetype(); |
| 234 | unsigned int arch = cpu_architecture(); | 234 | unsigned int arch = cpu_architecture(); |
| 235 | 235 | ||
| 236 | if (arch >= CPU_ARCH_ARMv7) { | 236 | if (arch >= CPU_ARCH_ARMv6) { |
| 237 | cacheid = CACHEID_VIPT_NONALIASING; | 237 | if ((cachetype & (7 << 29)) == 4 << 29) { |
| 238 | if ((cachetype & (3 << 14)) == 1 << 14) | 238 | /* ARMv7 register format */ |
| 239 | cacheid |= CACHEID_ASID_TAGGED; | 239 | cacheid = CACHEID_VIPT_NONALIASING; |
| 240 | } else if (arch >= CPU_ARCH_ARMv6) { | 240 | if ((cachetype & (3 << 14)) == 1 << 14) |
| 241 | if (cachetype & (1 << 23)) | 241 | cacheid |= CACHEID_ASID_TAGGED; |
| 242 | } else if (cachetype & (1 << 23)) | ||
| 242 | cacheid = CACHEID_VIPT_ALIASING; | 243 | cacheid = CACHEID_VIPT_ALIASING; |
| 243 | else | 244 | else |
| 244 | cacheid = CACHEID_VIPT_NONALIASING; | 245 | cacheid = CACHEID_VIPT_NONALIASING; |
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 9bb4f043aa22..7ac812dc055a 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c | |||
| @@ -332,7 +332,6 @@ static int at91_pm_enter(suspend_state_t state) | |||
| 332 | at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR)); | 332 | at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR)); |
| 333 | 333 | ||
| 334 | error: | 334 | error: |
| 335 | sdram_selfrefresh_disable(); | ||
| 336 | target_state = PM_SUSPEND_ON; | 335 | target_state = PM_SUSPEND_ON; |
| 337 | at91_irq_resume(); | 336 | at91_irq_resume(); |
| 338 | at91_gpio_resume(); | 337 | at91_gpio_resume(); |
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S index 8a7f65ba14b7..94077fbd96b7 100644 --- a/arch/arm/mm/abort-ev6.S +++ b/arch/arm/mm/abort-ev6.S | |||
| @@ -23,7 +23,8 @@ ENTRY(v6_early_abort) | |||
| 23 | #ifdef CONFIG_CPU_32v6K | 23 | #ifdef CONFIG_CPU_32v6K |
| 24 | clrex | 24 | clrex |
| 25 | #else | 25 | #else |
| 26 | strex r0, r1, [sp] @ Clear the exclusive monitor | 26 | sub r1, sp, #4 @ Get unused stack location |
| 27 | strex r0, r1, [r1] @ Clear the exclusive monitor | ||
| 27 | #endif | 28 | #endif |
| 28 | mrc p15, 0, r1, c5, c0, 0 @ get FSR | 29 | mrc p15, 0, r1, c5, c0, 0 @ get FSR |
| 29 | mrc p15, 0, r0, c6, c0, 0 @ get FAR | 30 | mrc p15, 0, r0, c6, c0, 0 @ get FAR |
diff --git a/arch/arm/plat-s3c64xx/irq-eint.c b/arch/arm/plat-s3c64xx/irq-eint.c index 1f7cc0067f5c..ebb305ce7689 100644 --- a/arch/arm/plat-s3c64xx/irq-eint.c +++ b/arch/arm/plat-s3c64xx/irq-eint.c | |||
| @@ -55,7 +55,7 @@ static void s3c_irq_eint_unmask(unsigned int irq) | |||
| 55 | u32 mask; | 55 | u32 mask; |
| 56 | 56 | ||
| 57 | mask = __raw_readl(S3C64XX_EINT0MASK); | 57 | mask = __raw_readl(S3C64XX_EINT0MASK); |
| 58 | mask |= eint_irq_to_bit(irq); | 58 | mask &= ~eint_irq_to_bit(irq); |
| 59 | __raw_writel(mask, S3C64XX_EINT0MASK); | 59 | __raw_writel(mask, S3C64XX_EINT0MASK); |
| 60 | } | 60 | } |
| 61 | 61 | ||
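The fix above makes unmask clear the interrupt's bit in EINT0MASK
instead of setting it; the old code effectively re-masked the line. For
contrast, a sketch of the assumed masking counterpart (this file's
actual mask handler is outside the diff context):

	static void s3c_irq_eint_mask(unsigned int irq)
	{
		u32 mask;

		mask = __raw_readl(S3C64XX_EINT0MASK);
		mask |= eint_irq_to_bit(irq);	/* a set bit masks the line */
		__raw_writel(mask, S3C64XX_EINT0MASK);
	}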
diff --git a/arch/powerpc/platforms/86xx/gef_sbc610.c b/arch/powerpc/platforms/86xx/gef_sbc610.c index fb371f5ce132..d6b772ba3b8f 100644 --- a/arch/powerpc/platforms/86xx/gef_sbc610.c +++ b/arch/powerpc/platforms/86xx/gef_sbc610.c | |||
| @@ -142,6 +142,10 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev) | |||
| 142 | { | 142 | { |
| 143 | unsigned int val; | 143 | unsigned int val; |
| 144 | 144 | ||
| 145 | /* Do not do the fixup on other platforms! */ | ||
| 146 | if (!machine_is(gef_sbc610)) | ||
| 147 | return; | ||
| 148 | |||
| 145 | printk(KERN_INFO "Running NEC uPD720101 Fixup\n"); | 149 | printk(KERN_INFO "Running NEC uPD720101 Fixup\n"); |
| 146 | 150 | ||
| 147 | /* Ensure ports 1, 2, 3, 4 & 5 are enabled */ | 151 | /* Ensure ports 1, 2, 3, 4 & 5 are enabled */ |
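For context: PCI header fixups of this kind are registered by
vendor/device ID and run on every matching device regardless of board,
which is why the machine_is() guard is needed. A hedged sketch of the
assumed registration (the actual DECLARE_PCI_FIXUP_* line in this file
is outside the diff context):

	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_USB,
				 gef_sbc610_nec_fixup);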
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index c42cd898f68b..6118890c946d 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c | |||
| @@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void) | |||
| 556 | module_init(aes_s390_init); | 556 | module_init(aes_s390_init); |
| 557 | module_exit(aes_s390_fini); | 557 | module_exit(aes_s390_fini); |
| 558 | 558 | ||
| 559 | MODULE_ALIAS("aes"); | 559 | MODULE_ALIAS("aes-all"); |
| 560 | 560 | ||
| 561 | MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); | 561 | MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); |
| 562 | MODULE_LICENSE("GPL"); | 562 | MODULE_LICENSE("GPL"); |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f5cef3fbf9a5..31758378bcd2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
| @@ -783,6 +783,11 @@ config X86_MCE_AMD | |||
| 783 | Additional support for AMD specific MCE features such as | 783 | Additional support for AMD specific MCE features such as |
| 784 | the DRAM Error Threshold. | 784 | the DRAM Error Threshold. |
| 785 | 785 | ||
| 786 | config X86_MCE_THRESHOLD | ||
| 787 | depends on X86_MCE_AMD || X86_MCE_INTEL | ||
| 788 | bool | ||
| 789 | default y | ||
| 790 | |||
| 786 | config X86_MCE_NONFATAL | 791 | config X86_MCE_NONFATAL |
| 787 | tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" | 792 | tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" |
| 788 | depends on X86_32 && X86_MCE | 793 | depends on X86_32 && X86_MCE |
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 63134e31e8b9..bc9514fb3b13 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h | |||
| @@ -53,6 +53,7 @@ | |||
| 53 | #define APIC_ESR_SENDILL 0x00020 | 53 | #define APIC_ESR_SENDILL 0x00020 |
| 54 | #define APIC_ESR_RECVILL 0x00040 | 54 | #define APIC_ESR_RECVILL 0x00040 |
| 55 | #define APIC_ESR_ILLREGA 0x00080 | 55 | #define APIC_ESR_ILLREGA 0x00080 |
| 56 | #define APIC_LVTCMCI 0x2f0 | ||
| 56 | #define APIC_ICR 0x300 | 57 | #define APIC_ICR 0x300 |
| 57 | #define APIC_DEST_SELF 0x40000 | 58 | #define APIC_DEST_SELF 0x40000 |
| 58 | #define APIC_DEST_ALLINC 0x80000 | 59 | #define APIC_DEST_ALLINC 0x80000 |
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ca5ffb2856b6..edc90f23e708 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h | |||
| @@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); | |||
| 37 | 37 | ||
| 38 | #else /* !CONFIG_X86_32 */ | 38 | #else /* !CONFIG_X86_32 */ |
| 39 | 39 | ||
| 40 | #define MAX_EFI_IO_PAGES 100 | ||
| 41 | |||
| 42 | extern u64 efi_call0(void *fp); | 40 | extern u64 efi_call0(void *fp); |
| 43 | extern u64 efi_call1(void *fp, u64 arg1); | 41 | extern u64 efi_call1(void *fp, u64 arg1); |
| 44 | extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); | 42 | extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); |
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index dca8f03da5b2..63a79c77d220 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h | |||
| @@ -24,9 +24,6 @@ | |||
| 24 | #include <asm/kmap_types.h> | 24 | #include <asm/kmap_types.h> |
| 25 | #else | 25 | #else |
| 26 | #include <asm/vsyscall.h> | 26 | #include <asm/vsyscall.h> |
| 27 | #ifdef CONFIG_EFI | ||
| 28 | #include <asm/efi.h> | ||
| 29 | #endif | ||
| 30 | #endif | 27 | #endif |
| 31 | 28 | ||
| 32 | /* | 29 | /* |
| @@ -92,13 +89,6 @@ enum fixed_addresses { | |||
| 92 | FIX_IO_APIC_BASE_0, | 89 | FIX_IO_APIC_BASE_0, |
| 93 | FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, | 90 | FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, |
| 94 | #endif | 91 | #endif |
| 95 | #ifdef CONFIG_X86_64 | ||
| 96 | #ifdef CONFIG_EFI | ||
| 97 | FIX_EFI_IO_MAP_LAST_PAGE, | ||
| 98 | FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE | ||
| 99 | + MAX_EFI_IO_PAGES - 1, | ||
| 100 | #endif | ||
| 101 | #endif | ||
| 102 | #ifdef CONFIG_X86_VISWS_APIC | 92 | #ifdef CONFIG_X86_VISWS_APIC |
| 103 | FIX_CO_CPU, /* Cobalt timer */ | 93 | FIX_CO_CPU, /* Cobalt timer */ |
| 104 | FIX_CO_APIC, /* Cobalt APIC Redirection Table */ | 94 | FIX_CO_APIC, /* Cobalt APIC Redirection Table */ |
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 48f0004db8c9..71c9e5183982 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h | |||
| @@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk) | |||
| 172 | 172 | ||
| 173 | #else /* CONFIG_X86_32 */ | 173 | #else /* CONFIG_X86_32 */ |
| 174 | 174 | ||
| 175 | extern void finit(void); | 175 | #ifdef CONFIG_MATH_EMULATION |
| 176 | extern void finit_task(struct task_struct *tsk); | ||
| 177 | #else | ||
| 178 | static inline void finit_task(struct task_struct *tsk) | ||
| 179 | { | ||
| 180 | } | ||
| 181 | #endif | ||
| 176 | 182 | ||
| 177 | static inline void tolerant_fwait(void) | 183 | static inline void tolerant_fwait(void) |
| 178 | { | 184 | { |
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h new file mode 100644 index 000000000000..36fb1a6a5109 --- /dev/null +++ b/arch/x86/include/asm/init.h | |||
| @@ -0,0 +1,18 @@ | |||
| 1 | #ifndef _ASM_X86_INIT_32_H | ||
| 2 | #define _ASM_X86_INIT_32_H | ||
| 3 | |||
| 4 | #ifdef CONFIG_X86_32 | ||
| 5 | extern void __init early_ioremap_page_table_range_init(void); | ||
| 6 | #endif | ||
| 7 | |||
| 8 | extern unsigned long __init | ||
| 9 | kernel_physical_mapping_init(unsigned long start, | ||
| 10 | unsigned long end, | ||
| 11 | unsigned long page_size_mask); | ||
| 12 | |||
| 13 | |||
| 14 | extern unsigned long __initdata e820_table_start; | ||
| 15 | extern unsigned long __meminitdata e820_table_end; | ||
| 16 | extern unsigned long __meminitdata e820_table_top; | ||
| 17 | |||
| 18 | #endif /* _ASM_X86_INIT_32_H */ | ||
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 32c6e17b960b..563933e06a35 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
| @@ -11,6 +11,8 @@ | |||
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ | 13 | #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ |
| 14 | #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ | ||
| 15 | #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ | ||
| 14 | 16 | ||
| 15 | #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ | 17 | #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ |
| 16 | #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ | 18 | #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ |
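These capability bits live in MSR_IA32_MCG_CAP. A minimal sketch of how
CMCI support would be detected with the new MCG_CMCI_P bit (the real
check lives in mce_intel_64.c, which is not part of this diff):

	static int cmci_capable(void)
	{
		u64 cap;

		rdmsrl(MSR_IA32_MCG_CAP, cap);
		return !!(cap & MCG_CMCI_P);	/* corrected errors signalled via CMCI */
	}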
| @@ -90,14 +92,29 @@ extern int mce_disabled; | |||
| 90 | 92 | ||
| 91 | #include <asm/atomic.h> | 93 | #include <asm/atomic.h> |
| 92 | 94 | ||
| 95 | void mce_setup(struct mce *m); | ||
| 93 | void mce_log(struct mce *m); | 96 | void mce_log(struct mce *m); |
| 94 | DECLARE_PER_CPU(struct sys_device, device_mce); | 97 | DECLARE_PER_CPU(struct sys_device, device_mce); |
| 95 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | 98 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); |
| 96 | 99 | ||
| 100 | /* | ||
| 101 | * To support more than 128 banks we would first need to move the | ||
| 102 | * predefined Linux extended banks out of the way. | ||
| 103 | */ | ||
| 104 | #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) | ||
| 105 | |||
| 97 | #ifdef CONFIG_X86_MCE_INTEL | 106 | #ifdef CONFIG_X86_MCE_INTEL |
| 98 | void mce_intel_feature_init(struct cpuinfo_x86 *c); | 107 | void mce_intel_feature_init(struct cpuinfo_x86 *c); |
| 108 | void cmci_clear(void); | ||
| 109 | void cmci_reenable(void); | ||
| 110 | void cmci_rediscover(int dying); | ||
| 111 | void cmci_recheck(void); | ||
| 99 | #else | 112 | #else |
| 100 | static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } | 113 | static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } |
| 114 | static inline void cmci_clear(void) {} | ||
| 115 | static inline void cmci_reenable(void) {} | ||
| 116 | static inline void cmci_rediscover(int dying) {} | ||
| 117 | static inline void cmci_recheck(void) {} | ||
| 101 | #endif | 118 | #endif |
| 102 | 119 | ||
| 103 | #ifdef CONFIG_X86_MCE_AMD | 120 | #ifdef CONFIG_X86_MCE_AMD |
| @@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); | |||
| 106 | static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } | 123 | static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } |
| 107 | #endif | 124 | #endif |
| 108 | 125 | ||
| 109 | void mce_log_therm_throt_event(unsigned int cpu, __u64 status); | 126 | extern int mce_available(struct cpuinfo_x86 *c); |
| 127 | |||
| 128 | void mce_log_therm_throt_event(__u64 status); | ||
| 110 | 129 | ||
| 111 | extern atomic_t mce_entry; | 130 | extern atomic_t mce_entry; |
| 112 | 131 | ||
| 113 | extern void do_machine_check(struct pt_regs *, long); | 132 | extern void do_machine_check(struct pt_regs *, long); |
| 133 | |||
| 134 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); | ||
| 135 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); | ||
| 136 | |||
| 137 | enum mcp_flags { | ||
| 138 | MCP_TIMESTAMP = (1 << 0), /* log time stamp */ | ||
| 139 | MCP_UC = (1 << 1), /* log uncorrected errors */ | ||
| 140 | }; | ||
| 141 | extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); | ||
| 142 | |||
| 114 | extern int mce_notify_user(void); | 143 | extern int mce_notify_user(void); |
| 115 | 144 | ||
| 116 | #endif /* !CONFIG_X86_32 */ | 145 | #endif /* !CONFIG_X86_32 */ |
| @@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c); | |||
| 120 | #else | 149 | #else |
| 121 | #define mcheck_init(c) do { } while (0) | 150 | #define mcheck_init(c) do { } while (0) |
| 122 | #endif | 151 | #endif |
| 123 | extern void stop_mce(void); | 152 | |
| 124 | extern void restart_mce(void); | 153 | extern void (*mce_threshold_vector)(void); |
| 125 | 154 | ||
| 126 | #endif /* __KERNEL__ */ | 155 | #endif /* __KERNEL__ */ |
| 127 | #endif /* _ASM_X86_MCE_H */ | 156 | #endif /* _ASM_X86_MCE_H */ |
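For reference, the new machine_check_poll() API is consumed exactly
this way by mce_init() later in this patch: build a bank bitmap, then
poll, e.g. to log events left over from before reset:

	mce_banks_t all_banks;

	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC, &all_banks);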
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 358acc59ae04..2dbd2314139e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
| @@ -77,6 +77,11 @@ | |||
| 77 | #define MSR_IA32_MC0_ADDR 0x00000402 | 77 | #define MSR_IA32_MC0_ADDR 0x00000402 |
| 78 | #define MSR_IA32_MC0_MISC 0x00000403 | 78 | #define MSR_IA32_MC0_MISC 0x00000403 |
| 79 | 79 | ||
| 80 | /* These are consecutive and not in the normal four-registers-per-bank MCE block */ | ||
| 81 | #define MSR_IA32_MC0_CTL2 0x00000280 | ||
| 82 | #define CMCI_EN (1ULL << 30) | ||
| 83 | #define CMCI_THRESHOLD_MASK 0xffffULL | ||
| 84 | |||
| 80 | #define MSR_P6_PERFCTR0 0x000000c1 | 85 | #define MSR_P6_PERFCTR0 0x000000c1 |
| 81 | #define MSR_P6_PERFCTR1 0x000000c2 | 86 | #define MSR_P6_PERFCTR1 0x000000c2 |
| 82 | #define MSR_P6_EVNTSEL0 0x00000186 | 87 | #define MSR_P6_EVNTSEL0 0x00000186 |
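Unlike the CTL/STATUS/ADDR/MISC registers, which are spaced four MSRs
per bank, each bank's CTL2 register sits at MSR_IA32_MC0_CTL2 + bank. A
hedged sketch of how a bank's CMCI would be enabled with these defines
(the actual enabling code is in mce_intel_64.c, not shown here; the
threshold value is illustrative):

	int bank = 0;			/* illustrative bank index */
	u64 val;

	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
	val &= ~CMCI_THRESHOLD_MASK;
	val |= CMCI_EN | 1;		/* interrupt after one corrected error */
	wrmsrl(MSR_IA32_MC0_CTL2 + bank, val);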
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 2d625da6603c..826ad37006ab 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h | |||
| @@ -40,14 +40,8 @@ | |||
| 40 | 40 | ||
| 41 | #ifndef __ASSEMBLY__ | 41 | #ifndef __ASSEMBLY__ |
| 42 | 42 | ||
| 43 | struct pgprot; | ||
| 44 | |||
| 45 | extern int page_is_ram(unsigned long pagenr); | 43 | extern int page_is_ram(unsigned long pagenr); |
| 46 | extern int devmem_is_allowed(unsigned long pagenr); | 44 | extern int devmem_is_allowed(unsigned long pagenr); |
| 47 | extern void map_devmem(unsigned long pfn, unsigned long size, | ||
| 48 | struct pgprot vma_prot); | ||
| 49 | extern void unmap_devmem(unsigned long pfn, unsigned long size, | ||
| 50 | struct pgprot vma_prot); | ||
| 51 | 45 | ||
| 52 | extern unsigned long max_low_pfn_mapped; | 46 | extern unsigned long max_low_pfn_mapped; |
| 53 | extern unsigned long max_pfn_mapped; | 47 | extern unsigned long max_pfn_mapped; |
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index b0e70056838e..2cd07b9422f4 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #define _ASM_X86_PAT_H | 2 | #define _ASM_X86_PAT_H |
| 3 | 3 | ||
| 4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
| 5 | #include <asm/pgtable_types.h> | ||
| 5 | 6 | ||
| 6 | #ifdef CONFIG_X86_PAT | 7 | #ifdef CONFIG_X86_PAT |
| 7 | extern int pat_enabled; | 8 | extern int pat_enabled; |
| @@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end); | |||
| 17 | 18 | ||
| 18 | extern int kernel_map_sync_memtype(u64 base, unsigned long size, | 19 | extern int kernel_map_sync_memtype(u64 base, unsigned long size, |
| 19 | unsigned long flag); | 20 | unsigned long flag); |
| 21 | extern void map_devmem(unsigned long pfn, unsigned long size, | ||
| 22 | struct pgprot vma_prot); | ||
| 23 | extern void unmap_devmem(unsigned long pfn, unsigned long size, | ||
| 24 | struct pgprot vma_prot); | ||
| 20 | 25 | ||
| 21 | #endif /* _ASM_X86_PAT_H */ | 26 | #endif /* _ASM_X86_PAT_H */ |
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index bd8df3b2fe04..2733fad45f98 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h | |||
| @@ -25,6 +25,11 @@ | |||
| 25 | * area for the same reason. ;) | 25 | * area for the same reason. ;) |
| 26 | */ | 26 | */ |
| 27 | #define VMALLOC_OFFSET (8 * 1024 * 1024) | 27 | #define VMALLOC_OFFSET (8 * 1024 * 1024) |
| 28 | |||
| 29 | #ifndef __ASSEMBLER__ | ||
| 30 | extern bool __vmalloc_start_set; /* set once high_memory is set */ | ||
| 31 | #endif | ||
| 32 | |||
| 28 | #define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) | 33 | #define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) |
| 29 | #ifdef CONFIG_X86_PAE | 34 | #ifdef CONFIG_X86_PAE |
| 30 | #define LAST_PKMAP 512 | 35 | #define LAST_PKMAP 512 |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4d258ad76a0f..b8238dc8786d 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
| @@ -273,6 +273,7 @@ typedef struct page *pgtable_t; | |||
| 273 | 273 | ||
| 274 | extern pteval_t __supported_pte_mask; | 274 | extern pteval_t __supported_pte_mask; |
| 275 | extern int nx_enabled; | 275 | extern int nx_enabled; |
| 276 | extern void set_nx(void); | ||
| 276 | 277 | ||
| 277 | #define pgprot_writecombine pgprot_writecombine | 278 | #define pgprot_writecombine pgprot_writecombine |
| 278 | extern pgprot_t pgprot_writecombine(pgprot_t prot); | 279 | extern pgprot_t pgprot_writecombine(pgprot_t prot); |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6907b8e85d52..4c80f1557433 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
| @@ -414,9 +414,17 @@ void __init alternative_instructions(void) | |||
| 414 | that might execute the to be patched code. | 414 | that might execute the to be patched code. |
| 415 | Other CPUs are not running. */ | 415 | Other CPUs are not running. */ |
| 416 | stop_nmi(); | 416 | stop_nmi(); |
| 417 | #ifdef CONFIG_X86_MCE | 417 | |
| 418 | stop_mce(); | 418 | /* |
| 419 | #endif | 419 | * Don't stop machine check exceptions while patching. |
| 420 | * MCEs only happen when something got corrupted and in this | ||
| 421 | * case we must do something about the corruption. | ||
| 422 | * Ignoring it is worse than an unlikely patching race. | ||
| 423 | * Also machine checks tend to be broadcast and if one CPU | ||
| 424 | * goes into machine check the others follow quickly, so we don't | ||
| 425 | * expect a machine check to cause undue problems during code | ||
| 426 | * patching. | ||
| 427 | */ | ||
| 420 | 428 | ||
| 421 | apply_alternatives(__alt_instructions, __alt_instructions_end); | 429 | apply_alternatives(__alt_instructions, __alt_instructions_end); |
| 422 | 430 | ||
| @@ -456,9 +464,6 @@ void __init alternative_instructions(void) | |||
| 456 | (unsigned long)__smp_locks_end); | 464 | (unsigned long)__smp_locks_end); |
| 457 | 465 | ||
| 458 | restart_nmi(); | 466 | restart_nmi(); |
| 459 | #ifdef CONFIG_X86_MCE | ||
| 460 | restart_mce(); | ||
| 461 | #endif | ||
| 462 | } | 467 | } |
| 463 | 468 | ||
| 464 | /** | 469 | /** |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f9cecdfd05c5..30909a258d0f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include <asm/idle.h> | 46 | #include <asm/idle.h> |
| 47 | #include <asm/mtrr.h> | 47 | #include <asm/mtrr.h> |
| 48 | #include <asm/smp.h> | 48 | #include <asm/smp.h> |
| 49 | #include <asm/mce.h> | ||
| 49 | 50 | ||
| 50 | unsigned int num_processors; | 51 | unsigned int num_processors; |
| 51 | 52 | ||
| @@ -842,6 +843,14 @@ void clear_local_APIC(void) | |||
| 842 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); | 843 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); |
| 843 | } | 844 | } |
| 844 | #endif | 845 | #endif |
| 846 | #ifdef CONFIG_X86_MCE_INTEL | ||
| 847 | if (maxlvt >= 6) { | ||
| 848 | v = apic_read(APIC_LVTCMCI); | ||
| 849 | if (!(v & APIC_LVT_MASKED)) | ||
| 850 | apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); | ||
| 851 | } | ||
| 852 | #endif | ||
| 853 | |||
| 845 | /* | 854 | /* |
| 846 | * Clean APIC state for other OSs: | 855 | * Clean APIC state for other OSs: |
| 847 | */ | 856 | */ |
| @@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void) | |||
| 1241 | apic_write(APIC_LVT1, value); | 1250 | apic_write(APIC_LVT1, value); |
| 1242 | 1251 | ||
| 1243 | preempt_enable(); | 1252 | preempt_enable(); |
| 1253 | |||
| 1254 | #ifdef CONFIG_X86_MCE_INTEL | ||
| 1255 | /* Recheck CMCI information after local APIC is up on CPU #0 */ | ||
| 1256 | if (smp_processor_id() == 0) | ||
| 1257 | cmci_recheck(); | ||
| 1258 | #endif | ||
| 1244 | } | 1259 | } |
| 1245 | 1260 | ||
| 1246 | void __cpuinit end_local_APIC_setup(void) | 1261 | void __cpuinit end_local_APIC_setup(void) |
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb69..b2f89829bbe8 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile | |||
| @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o | |||
| 4 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o | 4 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o |
| 5 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o | 5 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o |
| 6 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o | 6 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o |
| 7 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index dfaebce3633e..3552119b091d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c | |||
| @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) | |||
| 60 | } | 60 | } |
| 61 | } | 61 | } |
| 62 | 62 | ||
| 63 | static unsigned long old_cr4 __initdata; | ||
| 64 | |||
| 65 | void __init stop_mce(void) | ||
| 66 | { | ||
| 67 | old_cr4 = read_cr4(); | ||
| 68 | clear_in_cr4(X86_CR4_MCE); | ||
| 69 | } | ||
| 70 | |||
| 71 | void __init restart_mce(void) | ||
| 72 | { | ||
| 73 | if (old_cr4 & X86_CR4_MCE) | ||
| 74 | set_in_cr4(X86_CR4_MCE); | ||
| 75 | } | ||
| 76 | |||
| 77 | static int __init mcheck_disable(char *str) | 63 | static int __init mcheck_disable(char *str) |
| 78 | { | 64 | { |
| 79 | mce_disabled = 1; | 65 | mce_disabled = 1; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fe79985ce0f2..bfbd5323a635 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c | |||
| @@ -3,6 +3,8 @@ | |||
| 3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. | 3 | * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. |
| 4 | * Rest from unknown author(s). | 4 | * Rest from unknown author(s). |
| 5 | * 2004 Andi Kleen. Rewrote most of it. | 5 | * 2004 Andi Kleen. Rewrote most of it. |
| 6 | * Copyright 2008 Intel Corporation | ||
| 7 | * Author: Andi Kleen | ||
| 6 | */ | 8 | */ |
| 7 | 9 | ||
| 8 | #include <linux/init.h> | 10 | #include <linux/init.h> |
| @@ -24,6 +26,9 @@ | |||
| 24 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
| 25 | #include <linux/kmod.h> | 27 | #include <linux/kmod.h> |
| 26 | #include <linux/kdebug.h> | 28 | #include <linux/kdebug.h> |
| 29 | #include <linux/kobject.h> | ||
| 30 | #include <linux/sysfs.h> | ||
| 31 | #include <linux/ratelimit.h> | ||
| 27 | #include <asm/processor.h> | 32 | #include <asm/processor.h> |
| 28 | #include <asm/msr.h> | 33 | #include <asm/msr.h> |
| 29 | #include <asm/mce.h> | 34 | #include <asm/mce.h> |
| @@ -32,7 +37,6 @@ | |||
| 32 | #include <asm/idle.h> | 37 | #include <asm/idle.h> |
| 33 | 38 | ||
| 34 | #define MISC_MCELOG_MINOR 227 | 39 | #define MISC_MCELOG_MINOR 227 |
| 35 | #define NR_SYSFS_BANKS 6 | ||
| 36 | 40 | ||
| 37 | atomic_t mce_entry; | 41 | atomic_t mce_entry; |
| 38 | 42 | ||
| @@ -47,7 +51,7 @@ static int mce_dont_init; | |||
| 47 | */ | 51 | */ |
| 48 | static int tolerant = 1; | 52 | static int tolerant = 1; |
| 49 | static int banks; | 53 | static int banks; |
| 50 | static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; | 54 | static u64 *bank; |
| 51 | static unsigned long notify_user; | 55 | static unsigned long notify_user; |
| 52 | static int rip_msr; | 56 | static int rip_msr; |
| 53 | static int mce_bootlog = -1; | 57 | static int mce_bootlog = -1; |
| @@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL }; | |||
| 58 | 62 | ||
| 59 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | 63 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); |
| 60 | 64 | ||
| 65 | /* MCA banks polled by the periodic polling timer for corrected events */ | ||
| 66 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | ||
| 67 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL | ||
| 68 | }; | ||
| 69 | |||
| 70 | /* Do initial initialization of a struct mce */ | ||
| 71 | void mce_setup(struct mce *m) | ||
| 72 | { | ||
| 73 | memset(m, 0, sizeof(struct mce)); | ||
| 74 | m->cpu = smp_processor_id(); | ||
| 75 | rdtscll(m->tsc); | ||
| 76 | } | ||
| 77 | |||
| 61 | /* | 78 | /* |
| 62 | * Lockless MCE logging infrastructure. | 79 | * Lockless MCE logging infrastructure. |
| 63 | * This avoids deadlocks on printk locks without having to break locks. Also | 80 | * This avoids deadlocks on printk locks without having to break locks. Also |
| @@ -119,11 +136,11 @@ static void print_mce(struct mce *m) | |||
| 119 | print_symbol("{%s}", m->ip); | 136 | print_symbol("{%s}", m->ip); |
| 120 | printk("\n"); | 137 | printk("\n"); |
| 121 | } | 138 | } |
| 122 | printk(KERN_EMERG "TSC %Lx ", m->tsc); | 139 | printk(KERN_EMERG "TSC %llx ", m->tsc); |
| 123 | if (m->addr) | 140 | if (m->addr) |
| 124 | printk("ADDR %Lx ", m->addr); | 141 | printk("ADDR %llx ", m->addr); |
| 125 | if (m->misc) | 142 | if (m->misc) |
| 126 | printk("MISC %Lx ", m->misc); | 143 | printk("MISC %llx ", m->misc); |
| 127 | printk("\n"); | 144 | printk("\n"); |
| 128 | printk(KERN_EMERG "This is not a software problem!\n"); | 145 | printk(KERN_EMERG "This is not a software problem!\n"); |
| 129 | printk(KERN_EMERG "Run through mcelog --ascii to decode " | 146 | printk(KERN_EMERG "Run through mcelog --ascii to decode " |
| @@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) | |||
| 149 | panic(msg); | 166 | panic(msg); |
| 150 | } | 167 | } |
| 151 | 168 | ||
| 152 | static int mce_available(struct cpuinfo_x86 *c) | 169 | int mce_available(struct cpuinfo_x86 *c) |
| 153 | { | 170 | { |
| 171 | if (mce_dont_init) | ||
| 172 | return 0; | ||
| 154 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | 173 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); |
| 155 | } | 174 | } |
| 156 | 175 | ||
| @@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | |||
| 172 | } | 191 | } |
| 173 | 192 | ||
| 174 | /* | 193 | /* |
| 175 | * The actual machine check handler | 194 | * Poll for corrected events or events that happened before reset. |
| 195 | * Those are just logged through /dev/mcelog. | ||
| 196 | * | ||
| 197 | * This is executed in standard interrupt context. | ||
| 198 | */ | ||
| 199 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | ||
| 200 | { | ||
| 201 | struct mce m; | ||
| 202 | int i; | ||
| 203 | |||
| 204 | mce_setup(&m); | ||
| 205 | |||
| 206 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | ||
| 207 | for (i = 0; i < banks; i++) { | ||
| 208 | if (!bank[i] || !test_bit(i, *b)) | ||
| 209 | continue; | ||
| 210 | |||
| 211 | m.misc = 0; | ||
| 212 | m.addr = 0; | ||
| 213 | m.bank = i; | ||
| 214 | m.tsc = 0; | ||
| 215 | |||
| 216 | barrier(); | ||
| 217 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | ||
| 218 | if (!(m.status & MCI_STATUS_VAL)) | ||
| 219 | continue; | ||
| 220 | |||
| 221 | /* | ||
| 222 | * Uncorrected events are handled by the exception handler | ||
| 223 | * when it is enabled. But when the exception is disabled, log | ||
| 224 | * everything. | ||
| 225 | * | ||
| 226 | * TBD do the same check for MCI_STATUS_EN here? | ||
| 227 | */ | ||
| 228 | if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) | ||
| 229 | continue; | ||
| 230 | |||
| 231 | if (m.status & MCI_STATUS_MISCV) | ||
| 232 | rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); | ||
| 233 | if (m.status & MCI_STATUS_ADDRV) | ||
| 234 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | ||
| 235 | |||
| 236 | if (!(flags & MCP_TIMESTAMP)) | ||
| 237 | m.tsc = 0; | ||
| 238 | /* | ||
| 239 | * Don't get the IP here because it's unlikely to | ||
| 240 | * have anything to do with the actual error location. | ||
| 241 | */ | ||
| 242 | |||
| 243 | mce_log(&m); | ||
| 244 | add_taint(TAINT_MACHINE_CHECK); | ||
| 245 | |||
| 246 | /* | ||
| 247 | * Clear state for this bank. | ||
| 248 | */ | ||
| 249 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
| 250 | } | ||
| 251 | |||
| 252 | /* | ||
| 253 | * Don't clear MCG_STATUS here because it's only defined for | ||
| 254 | * exceptions. | ||
| 255 | */ | ||
| 256 | } | ||
| 257 | |||
| 258 | /* | ||
| 259 | * The actual machine check handler. This only handles real | ||
| 260 | * exceptions when something got corrupted coming in through int 18. | ||
| 261 | * | ||
| 262 | * This is executed in NMI context not subject to normal locking rules. This | ||
| 263 | * implies that most kernel services cannot be safely used. Don't even | ||
| 264 | * think about putting a printk in there! | ||
| 176 | */ | 265 | */ |
| 177 | void do_machine_check(struct pt_regs * regs, long error_code) | 266 | void do_machine_check(struct pt_regs * regs, long error_code) |
| 178 | { | 267 | { |
| @@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 190 | * error. | 279 | * error. |
| 191 | */ | 280 | */ |
| 192 | int kill_it = 0; | 281 | int kill_it = 0; |
| 282 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | ||
| 193 | 283 | ||
| 194 | atomic_inc(&mce_entry); | 284 | atomic_inc(&mce_entry); |
| 195 | 285 | ||
| 196 | if ((regs | 286 | if (notify_die(DIE_NMI, "machine check", regs, error_code, |
| 197 | && notify_die(DIE_NMI, "machine check", regs, error_code, | ||
| 198 | 18, SIGKILL) == NOTIFY_STOP) | 287 | 18, SIGKILL) == NOTIFY_STOP) |
| 199 | || !banks) | 288 | goto out2; |
| 289 | if (!banks) | ||
| 200 | goto out2; | 290 | goto out2; |
| 201 | 291 | ||
| 202 | memset(&m, 0, sizeof(struct mce)); | 292 | mce_setup(&m); |
| 203 | m.cpu = smp_processor_id(); | 293 | |
| 204 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | 294 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); |
| 205 | /* if the restart IP is not valid, we're done for */ | 295 | /* if the restart IP is not valid, we're done for */ |
| 206 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | 296 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) |
| @@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 210 | barrier(); | 300 | barrier(); |
| 211 | 301 | ||
| 212 | for (i = 0; i < banks; i++) { | 302 | for (i = 0; i < banks; i++) { |
| 213 | if (i < NR_SYSFS_BANKS && !bank[i]) | 303 | __clear_bit(i, toclear); |
| 304 | if (!bank[i]) | ||
| 214 | continue; | 305 | continue; |
| 215 | 306 | ||
| 216 | m.misc = 0; | 307 | m.misc = 0; |
| 217 | m.addr = 0; | 308 | m.addr = 0; |
| 218 | m.bank = i; | 309 | m.bank = i; |
| 219 | m.tsc = 0; | ||
| 220 | 310 | ||
| 221 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); | 311 | rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); |
| 222 | if ((m.status & MCI_STATUS_VAL) == 0) | 312 | if ((m.status & MCI_STATUS_VAL) == 0) |
| 223 | continue; | 313 | continue; |
| 224 | 314 | ||
| 315 | /* | ||
| 316 | * Errors without the UC bit set (i.e. corrected errors) are | ||
| 317 | * handled by machine_check_poll. Leave them alone. | ||
| 318 | */ | ||
| 319 | if ((m.status & MCI_STATUS_UC) == 0) | ||
| 320 | continue; | ||
| 321 | |||
| 322 | /* | ||
| 323 | * Set taint even when machine check was not enabled. | ||
| 324 | */ | ||
| 325 | add_taint(TAINT_MACHINE_CHECK); | ||
| 326 | |||
| 327 | __set_bit(i, toclear); | ||
| 328 | |||
| 225 | if (m.status & MCI_STATUS_EN) { | 329 | if (m.status & MCI_STATUS_EN) { |
| 226 | /* if PCC was set, there's no way out */ | 330 | /* if PCC was set, there's no way out */ |
| 227 | no_way_out |= !!(m.status & MCI_STATUS_PCC); | 331 | no_way_out |= !!(m.status & MCI_STATUS_PCC); |
| @@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 235 | no_way_out = 1; | 339 | no_way_out = 1; |
| 236 | kill_it = 1; | 340 | kill_it = 1; |
| 237 | } | 341 | } |
| 342 | } else { | ||
| 343 | /* | ||
| 344 | * Machine check event was not enabled. Clear, but | ||
| 345 | * ignore. | ||
| 346 | */ | ||
| 347 | continue; | ||
| 238 | } | 348 | } |
| 239 | 349 | ||
| 240 | if (m.status & MCI_STATUS_MISCV) | 350 | if (m.status & MCI_STATUS_MISCV) |
| @@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 243 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); | 353 | rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); |
| 244 | 354 | ||
| 245 | mce_get_rip(&m, regs); | 355 | mce_get_rip(&m, regs); |
| 246 | if (error_code >= 0) | 356 | mce_log(&m); |
| 247 | rdtscll(m.tsc); | ||
| 248 | if (error_code != -2) | ||
| 249 | mce_log(&m); | ||
| 250 | 357 | ||
| 251 | /* Did this bank cause the exception? */ | 358 | /* Did this bank cause the exception? */ |
| 252 | /* Assume that the bank with uncorrectable errors did it, | 359 | /* Assume that the bank with uncorrectable errors did it, |
| @@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 255 | panicm = m; | 362 | panicm = m; |
| 256 | panicm_found = 1; | 363 | panicm_found = 1; |
| 257 | } | 364 | } |
| 258 | |||
| 259 | add_taint(TAINT_MACHINE_CHECK); | ||
| 260 | } | 365 | } |
| 261 | 366 | ||
| 262 | /* Never do anything final in the polling timer */ | ||
| 263 | if (!regs) | ||
| 264 | goto out; | ||
| 265 | |||
| 266 | /* If we didn't find an uncorrectable error, pick | 367 | /* If we didn't find an uncorrectable error, pick |
| 267 | the last one (shouldn't happen, just being safe). */ | 368 | the last one (shouldn't happen, just being safe). */ |
| 268 | if (!panicm_found) | 369 | if (!panicm_found) |
| @@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 309 | /* notify userspace ASAP */ | 410 | /* notify userspace ASAP */ |
| 310 | set_thread_flag(TIF_MCE_NOTIFY); | 411 | set_thread_flag(TIF_MCE_NOTIFY); |
| 311 | 412 | ||
| 312 | out: | ||
| 313 | /* the last thing we do is clear state */ | 413 | /* the last thing we do is clear state */ |
| 314 | for (i = 0; i < banks; i++) | 414 | for (i = 0; i < banks; i++) { |
| 315 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 415 | if (test_bit(i, toclear)) |
| 416 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
| 417 | } | ||
| 316 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | 418 | wrmsrl(MSR_IA32_MCG_STATUS, 0); |
| 317 | out2: | 419 | out2: |
| 318 | atomic_dec(&mce_entry); | 420 | atomic_dec(&mce_entry); |
| @@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
| 332 | * and historically has been the register value of the | 434 | * and historically has been the register value of the |
| 333 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | 435 | * MSR_IA32_THERMAL_STATUS (Intel) msr. |
| 334 | */ | 436 | */ |
| 335 | void mce_log_therm_throt_event(unsigned int cpu, __u64 status) | 437 | void mce_log_therm_throt_event(__u64 status) |
| 336 | { | 438 | { |
| 337 | struct mce m; | 439 | struct mce m; |
| 338 | 440 | ||
| 339 | memset(&m, 0, sizeof(m)); | 441 | mce_setup(&m); |
| 340 | m.cpu = cpu; | ||
| 341 | m.bank = MCE_THERMAL_BANK; | 442 | m.bank = MCE_THERMAL_BANK; |
| 342 | m.status = status; | 443 | m.status = status; |
| 343 | rdtscll(m.tsc); | ||
| 344 | mce_log(&m); | 444 | mce_log(&m); |
| 345 | } | 445 | } |
| 346 | #endif /* CONFIG_X86_MCE_INTEL */ | 446 | #endif /* CONFIG_X86_MCE_INTEL */ |
| @@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) | |||
| 353 | 453 | ||
| 354 | static int check_interval = 5 * 60; /* 5 minutes */ | 454 | static int check_interval = 5 * 60; /* 5 minutes */ |
| 355 | static int next_interval; /* in jiffies */ | 455 | static int next_interval; /* in jiffies */ |
| 356 | static void mcheck_timer(struct work_struct *work); | 456 | static void mcheck_timer(unsigned long); |
| 357 | static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); | 457 | static DEFINE_PER_CPU(struct timer_list, mce_timer); |
| 358 | 458 | ||
| 359 | static void mcheck_check_cpu(void *info) | 459 | static void mcheck_timer(unsigned long data) |
| 360 | { | 460 | { |
| 361 | if (mce_available(¤t_cpu_data)) | 461 | struct timer_list *t = &per_cpu(mce_timer, data); |
| 362 | do_machine_check(NULL, 0); | ||
| 363 | } | ||
| 364 | 462 | ||
| 365 | static void mcheck_timer(struct work_struct *work) | 463 | WARN_ON(smp_processor_id() != data); |
| 366 | { | 464 | |
| 367 | on_each_cpu(mcheck_check_cpu, NULL, 1); | 465 | if (mce_available(¤t_cpu_data)) |
| 466 | machine_check_poll(MCP_TIMESTAMP, | ||
| 467 | &__get_cpu_var(mce_poll_banks)); | ||
| 368 | 468 | ||
| 369 | /* | 469 | /* |
| 370 | * Alert userspace if needed. If we logged an MCE, reduce the | 470 | * Alert userspace if needed. If we logged an MCE, reduce the |
| @@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work) | |||
| 377 | (int)round_jiffies_relative(check_interval*HZ)); | 477 | (int)round_jiffies_relative(check_interval*HZ)); |
| 378 | } | 478 | } |
| 379 | 479 | ||
| 380 | schedule_delayed_work(&mcheck_work, next_interval); | 480 | t->expires = jiffies + next_interval; |
| 481 | add_timer(t); | ||
| 482 | } | ||
| 483 | |||
| 484 | static void mce_do_trigger(struct work_struct *work) | ||
| 485 | { | ||
| 486 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); | ||
| 381 | } | 487 | } |
| 382 | 488 | ||
| 489 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | ||
| 490 | |||
| 383 | /* | 491 | /* |
| 384 | * This is only called from process context. This is where we do | 492 | * Notify the user(s) about new machine check events. |
| 385 | * anything we need to alert userspace about new MCEs. This is called | 493 | * Can be called from interrupt context, but not from machine check/NMI |
| 386 | * directly from the poller and also from entry.S and idle, thanks to | 494 | * context. |
| 387 | * TIF_MCE_NOTIFY. | ||
| 388 | */ | 495 | */ |
| 389 | int mce_notify_user(void) | 496 | int mce_notify_user(void) |
| 390 | { | 497 | { |
| 498 | /* Not more than two messages every minute */ | ||
| 499 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | ||
| 500 | |||
| 391 | clear_thread_flag(TIF_MCE_NOTIFY); | 501 | clear_thread_flag(TIF_MCE_NOTIFY); |
| 392 | if (test_and_clear_bit(0, ¬ify_user)) { | 502 | if (test_and_clear_bit(0, ¬ify_user)) { |
| 393 | static unsigned long last_print; | ||
| 394 | unsigned long now = jiffies; | ||
| 395 | |||
| 396 | wake_up_interruptible(&mce_wait); | 503 | wake_up_interruptible(&mce_wait); |
| 397 | if (trigger[0]) | ||
| 398 | call_usermodehelper(trigger, trigger_argv, NULL, | ||
| 399 | UMH_NO_WAIT); | ||
| 400 | 504 | ||
| 401 | if (time_after_eq(now, last_print + (check_interval*HZ))) { | 505 | /* |
| 402 | last_print = now; | 506 | * There is no risk of missing notifications because |
| 507 | * work_pending is always cleared before the function is | ||
| 508 | * executed. | ||
| 509 | */ | ||
| 510 | if (trigger[0] && !work_pending(&mce_trigger_work)) | ||
| 511 | schedule_work(&mce_trigger_work); | ||
| 512 | |||
| 513 | if (__ratelimit(&ratelimit)) | ||
| 403 | printk(KERN_INFO "Machine check events logged\n"); | 514 | printk(KERN_INFO "Machine check events logged\n"); |
| 404 | } | ||
| 405 | 515 | ||
| 406 | return 1; | 516 | return 1; |
| 407 | } | 517 | } |
| @@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = { | |||
| 425 | 535 | ||
| 426 | static __init int periodic_mcheck_init(void) | 536 | static __init int periodic_mcheck_init(void) |
| 427 | { | 537 | { |
| 428 | next_interval = check_interval * HZ; | 538 | idle_notifier_register(&mce_idle_notifier); |
| 429 | if (next_interval) | 539 | return 0; |
| 430 | schedule_delayed_work(&mcheck_work, | ||
| 431 | round_jiffies_relative(next_interval)); | ||
| 432 | idle_notifier_register(&mce_idle_notifier); | ||
| 433 | return 0; | ||
| 434 | } | 540 | } |
| 435 | __initcall(periodic_mcheck_init); | 541 | __initcall(periodic_mcheck_init); |
| 436 | 542 | ||
| 437 | |||
| 438 | /* | 543 | /* |
| 439 | * Initialize Machine Checks for a CPU. | 544 | * Initialize Machine Checks for a CPU. |
| 440 | */ | 545 | */ |
| 441 | static void mce_init(void *dummy) | 546 | static int mce_cap_init(void) |
| 442 | { | 547 | { |
| 443 | u64 cap; | 548 | u64 cap; |
| 444 | int i; | 549 | unsigned b; |
| 445 | 550 | ||
| 446 | rdmsrl(MSR_IA32_MCG_CAP, cap); | 551 | rdmsrl(MSR_IA32_MCG_CAP, cap); |
| 447 | banks = cap & 0xff; | 552 | b = cap & 0xff; |
| 448 | if (banks > MCE_EXTENDED_BANK) { | 553 | if (b > MAX_NR_BANKS) { |
| 449 | banks = MCE_EXTENDED_BANK; | 554 | printk(KERN_WARNING |
| 450 | printk(KERN_INFO "MCE: warning: using only %d banks\n", | 555 | "MCE: Using only %u machine check banks out of %u\n", |
| 451 | MCE_EXTENDED_BANK); | 556 | MAX_NR_BANKS, b); |
| 557 | b = MAX_NR_BANKS; | ||
| 452 | } | 558 | } |
| 559 | |||
| 560 | /* Don't support asymmetric configurations today */ | ||
| 561 | WARN_ON(banks != 0 && b != banks); | ||
| 562 | banks = b; | ||
| 563 | if (!bank) { | ||
| 564 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | ||
| 565 | if (!bank) | ||
| 566 | return -ENOMEM; | ||
| 567 | memset(bank, 0xff, banks * sizeof(u64)); | ||
| 568 | } | ||
| 569 | |||
| 453 | /* Use accurate RIP reporting if available. */ | 570 | /* Use accurate RIP reporting if available. */ |
| 454 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) | 571 | if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) |
| 455 | rip_msr = MSR_IA32_MCG_EIP; | 572 | rip_msr = MSR_IA32_MCG_EIP; |
| 456 | 573 | ||
| 457 | /* Log the machine checks left over from the previous reset. | 574 | return 0; |
| 458 | This also clears all registers */ | 575 | } |
| 459 | do_machine_check(NULL, mce_bootlog ? -1 : -2); | 576 | |
| 577 | static void mce_init(void *dummy) | ||
| 578 | { | ||
| 579 | u64 cap; | ||
| 580 | int i; | ||
| 581 | mce_banks_t all_banks; | ||
| 582 | |||
| 583 | /* | ||
| 584 | * Log the machine checks left over from the previous reset. | ||
| 585 | */ | ||
| 586 | bitmap_fill(all_banks, MAX_NR_BANKS); | ||
| 587 | machine_check_poll(MCP_UC, &all_banks); | ||
| 460 | 588 | ||
| 461 | set_in_cr4(X86_CR4_MCE); | 589 | set_in_cr4(X86_CR4_MCE); |
| 462 | 590 | ||
| 591 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
| 463 | if (cap & MCG_CTL_P) | 592 | if (cap & MCG_CTL_P) |
| 464 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 593 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
| 465 | 594 | ||
| 466 | for (i = 0; i < banks; i++) { | 595 | for (i = 0; i < banks; i++) { |
| 467 | if (i < NR_SYSFS_BANKS) | 596 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); |
| 468 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | ||
| 469 | else | ||
| 470 | wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); | ||
| 471 | |||
| 472 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 597 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); |
| 473 | } | 598 | } |
| 474 | } | 599 | } |
| 475 | 600 | ||
| 476 | /* Add per CPU specific workarounds here */ | 601 | /* Add per CPU specific workarounds here */ |
| 477 | static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) | 602 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) |
| 478 | { | 603 | { |
| 479 | /* This should be disabled by the BIOS, but isn't always */ | 604 | /* This should be disabled by the BIOS, but isn't always */ |
| 480 | if (c->x86_vendor == X86_VENDOR_AMD) { | 605 | if (c->x86_vendor == X86_VENDOR_AMD) { |
| 481 | if(c->x86 == 15) | 606 | if (c->x86 == 15 && banks > 4) |
| 482 | /* disable GART TBL walk error reporting, which trips off | 607 | /* disable GART TBL walk error reporting, which trips off |
| 483 | incorrectly with the IOMMU & 3ware & Cerberus. */ | 608 | incorrectly with the IOMMU & 3ware & Cerberus. */ |
| 484 | clear_bit(10, &bank[4]); | 609 | clear_bit(10, (unsigned long *)&bank[4]); |
| 485 | if(c->x86 <= 17 && mce_bootlog < 0) | 610 | if(c->x86 <= 17 && mce_bootlog < 0) |
| 486 | /* Lots of broken BIOS around that don't clear them | 611 | /* Lots of broken BIOS around that don't clear them |
| 487 | by default and leave crap in there. Don't log. */ | 612 | by default and leave crap in there. Don't log. */ |
| @@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) | |||
| 504 | } | 629 | } |
| 505 | } | 630 | } |
| 506 | 631 | ||
| 632 | static void mce_init_timer(void) | ||
| 633 | { | ||
| 634 | struct timer_list *t = &__get_cpu_var(mce_timer); | ||
| 635 | |||
| 636 | /* data race harmless because everyone sets it to the same value */ | ||
| 637 | if (!next_interval) | ||
| 638 | next_interval = check_interval * HZ; | ||
| 639 | if (!next_interval) | ||
| 640 | return; | ||
| 641 | setup_timer(t, mcheck_timer, smp_processor_id()); | ||
| 642 | t->expires = round_jiffies_relative(jiffies + next_interval); | ||
| 643 | add_timer(t); | ||
| 644 | } | ||
| 645 | |||
| 507 | /* | 646 | /* |
| 508 | * Called for each booted CPU to set up machine checks. | 647 | * Called for each booted CPU to set up machine checks. |
| 509 | * Must be called with preempt off. | 648 | * Must be called with preempt off. |
| 510 | */ | 649 | */ |
| 511 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | 650 | void __cpuinit mcheck_init(struct cpuinfo_x86 *c) |
| 512 | { | 651 | { |
| 513 | mce_cpu_quirks(c); | 652 | if (!mce_available(c)) |
| 653 | return; | ||
| 514 | 654 | ||
| 515 | if (mce_dont_init || | 655 | if (mce_cap_init() < 0) { |
| 516 | !mce_available(c)) | 656 | mce_dont_init = 1; |
| 517 | return; | 657 | return; |
| 658 | } | ||
| 659 | mce_cpu_quirks(c); | ||
| 518 | 660 | ||
| 519 | mce_init(NULL); | 661 | mce_init(NULL); |
| 520 | mce_cpu_features(c); | 662 | mce_cpu_features(c); |
| 663 | mce_init_timer(); | ||
| 521 | } | 664 | } |
| 522 | 665 | ||
| 523 | /* | 666 | /* |
| @@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | |||
| 573 | { | 716 | { |
| 574 | unsigned long *cpu_tsc; | 717 | unsigned long *cpu_tsc; |
| 575 | static DEFINE_MUTEX(mce_read_mutex); | 718 | static DEFINE_MUTEX(mce_read_mutex); |
| 576 | unsigned next; | 719 | unsigned prev, next; |
| 577 | char __user *buf = ubuf; | 720 | char __user *buf = ubuf; |
| 578 | int i, err; | 721 | int i, err; |
| 579 | 722 | ||
| @@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, | |||
| 592 | } | 735 | } |
| 593 | 736 | ||
| 594 | err = 0; | 737 | err = 0; |
| 595 | for (i = 0; i < next; i++) { | 738 | prev = 0; |
| 596 | unsigned long start = jiffies; | 739 | do { |
| 597 | 740 | for (i = prev; i < next; i++) { | |
| 598 | while (!mcelog.entry[i].finished) { | 741 | unsigned long start = jiffies; |
| 599 | if (time_after_eq(jiffies, start + 2)) { | 742 | |
| 600 | memset(mcelog.entry + i,0, sizeof(struct mce)); | 743 | while (!mcelog.entry[i].finished) { |
| 601 | goto timeout; | 744 | if (time_after_eq(jiffies, start + 2)) { |
| 745 | memset(mcelog.entry + i, 0, | ||
| 746 | sizeof(struct mce)); | ||
| 747 | goto timeout; | ||
| 748 | } | ||
| 749 | cpu_relax(); | ||
| 602 | } | 750 | } |
| 603 | cpu_relax(); | 751 | smp_rmb(); |
| 752 | err |= copy_to_user(buf, mcelog.entry + i, | ||
| 753 | sizeof(struct mce)); | ||
| 754 | buf += sizeof(struct mce); | ||
| 755 | timeout: | ||
| 756 | ; | ||
| 604 | } | 757 | } |
| 605 | smp_rmb(); | ||
| 606 | err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); | ||
| 607 | buf += sizeof(struct mce); | ||
| 608 | timeout: | ||
| 609 | ; | ||
| 610 | } | ||
| 611 | 758 | ||
| 612 | memset(mcelog.entry, 0, next * sizeof(struct mce)); | 759 | memset(mcelog.entry + prev, 0, |
| 613 | mcelog.next = 0; | 760 | (next - prev) * sizeof(struct mce)); |
| 761 | prev = next; | ||
| 762 | next = cmpxchg(&mcelog.next, prev, 0); | ||
| 763 | } while (next != prev); | ||
| 614 | 764 | ||
| 615 | synchronize_sched(); | 765 | synchronize_sched(); |
| 616 | 766 | ||
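The rewritten read loop above is a lock-free consume: drain all entries
up to the current mcelog.next index, then try to reset the index with
cmpxchg; if the exchange fails because writers appended entries in the
meantime, cmpxchg returns the updated index and the loop drains the
fresh entries too. The bare pattern, as a sketch (the consume() helper
and simplified locking are illustrative, not kernel code):

	unsigned prev = 0;
	unsigned next = mcelog.next;

	do {
		consume(prev, next);	/* hypothetical: handle entries [prev, next) */
		prev = next;
		/* Reset the index only if nothing new arrived; otherwise
		 * cmpxchg returns the current index and we go around again. */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);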
| @@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = { | |||
| 680 | &mce_chrdev_ops, | 830 | &mce_chrdev_ops, |
| 681 | }; | 831 | }; |
| 682 | 832 | ||
| 683 | static unsigned long old_cr4 __initdata; | ||
| 684 | |||
| 685 | void __init stop_mce(void) | ||
| 686 | { | ||
| 687 | old_cr4 = read_cr4(); | ||
| 688 | clear_in_cr4(X86_CR4_MCE); | ||
| 689 | } | ||
| 690 | |||
| 691 | void __init restart_mce(void) | ||
| 692 | { | ||
| 693 | if (old_cr4 & X86_CR4_MCE) | ||
| 694 | set_in_cr4(X86_CR4_MCE); | ||
| 695 | } | ||
| 696 | |||
| 697 | /* | 833 | /* |
| 698 | * Old style boot options parsing. Only for compatibility. | 834 | * Old style boot options parsing. Only for compatibility. |
| 699 | */ | 835 | */ |
| @@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str) | |||
| 703 | return 1; | 839 | return 1; |
| 704 | } | 840 | } |
| 705 | 841 | ||
| 706 | /* mce=off disables machine check. Note you can re-enable it later | 842 | /* mce=off disables machine check. |
| 707 | using sysfs. | ||
| 708 | mce=TOLERANCELEVEL (number, see above) | 843 | mce=TOLERANCELEVEL (number, see above) |
| 709 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. | 844 | mce=bootlog Log MCEs from before booting. Disabled by default on AMD. |
| 710 | mce=nobootlog Don't log MCEs from before booting. */ | 845 | mce=nobootlog Don't log MCEs from before booting. */ |
| @@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable); | |||
| 728 | * Sysfs support | 863 | * Sysfs support |
| 729 | */ | 864 | */ |
| 730 | 865 | ||
| 866 | /* | ||
| 867 | * Disable machine checks on suspend and shutdown. We can't really handle | ||
| 868 | * them later. | ||
| 869 | */ | ||
| 870 | static int mce_disable(void) | ||
| 871 | { | ||
| 872 | int i; | ||
| 873 | |||
| 874 | for (i = 0; i < banks; i++) | ||
| 875 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
| 876 | return 0; | ||
| 877 | } | ||
| 878 | |||
| 879 | static int mce_suspend(struct sys_device *dev, pm_message_t state) | ||
| 880 | { | ||
| 881 | return mce_disable(); | ||
| 882 | } | ||
| 883 | |||
| 884 | static int mce_shutdown(struct sys_device *dev) | ||
| 885 | { | ||
| 886 | return mce_disable(); | ||
| 887 | } | ||
| 888 | |||
| 731 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. | 889 | /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. |
| 732 | Only one CPU is active at this time, the others get re-added later using | 890 |
| 733 | CPU hotplug. */ | 891 | CPU hotplug. */ |
| @@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev) | |||
| 738 | return 0; | 896 | return 0; |
| 739 | } | 897 | } |
| 740 | 898 | ||
| 899 | static void mce_cpu_restart(void *data) | ||
| 900 | { | ||
| 901 | del_timer_sync(&__get_cpu_var(mce_timer)); | ||
| 902 | if (mce_available(¤t_cpu_data)) | ||
| 903 | mce_init(NULL); | ||
| 904 | mce_init_timer(); | ||
| 905 | } | ||
| 906 | |||
| 741 | /* Reinit MCEs after user configuration changes */ | 907 | /* Reinit MCEs after user configuration changes */ |
| 742 | static void mce_restart(void) | 908 | static void mce_restart(void) |
| 743 | { | 909 | { |
| 744 | if (next_interval) | ||
| 745 | cancel_delayed_work(&mcheck_work); | ||
| 746 | /* Timer race is harmless here */ | ||
| 747 | on_each_cpu(mce_init, NULL, 1); | ||
| 748 | next_interval = check_interval * HZ; | 910 | next_interval = check_interval * HZ; |
| 749 | if (next_interval) | 911 | on_each_cpu(mce_cpu_restart, NULL, 1); |
| 750 | schedule_delayed_work(&mcheck_work, | ||
| 751 | round_jiffies_relative(next_interval)); | ||
| 752 | } | 912 | } |
| 753 | 913 | ||
| 754 | static struct sysdev_class mce_sysclass = { | 914 | static struct sysdev_class mce_sysclass = { |
| 915 | .suspend = mce_suspend, | ||
| 916 | .shutdown = mce_shutdown, | ||
| 755 | .resume = mce_resume, | 917 | .resume = mce_resume, |
| 756 | .name = "machinecheck", | 918 | .name = "machinecheck", |
| 757 | }; | 919 | }; |
| @@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit | |||
| 778 | } \ | 940 | } \ |
| 779 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | 941 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); |
| 780 | 942 | ||
| 781 | /* | 943 | static struct sysdev_attribute *bank_attrs; |
| 782 | * TBD should generate these dynamically based on number of available banks. | 944 | |
| 783 | * Have only 6 contol banks in /sysfs until then. | 945 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, |
| 784 | */ | 946 | char *buf) |
| 785 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | 947 | { |
| 786 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | 948 | u64 b = bank[attr - bank_attrs]; |
| 787 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | 949 | return sprintf(buf, "%llx\n", b); |
| 788 | ACCESSOR(bank3ctl,bank[3],mce_restart()) | 950 | } |
| 789 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | 951 | |
| 790 | ACCESSOR(bank5ctl,bank[5],mce_restart()) | 952 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, |
| 953 | const char *buf, size_t siz) | ||
| 954 | { | ||
| 955 | char *end; | ||
| 956 | u64 new = simple_strtoull(buf, &end, 0); | ||
| 957 | if (end == buf) | ||
| 958 | return -EINVAL; | ||
| 959 | bank[attr - bank_attrs] = new; | ||
| 960 | mce_restart(); | ||
| 961 | return end-buf; | ||
| 962 | } | ||
| 791 | 963 | ||
| 792 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, | 964 | static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, |
| 793 | char *buf) | 965 | char *buf) |
| @@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | |||
| 814 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | 986 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); |
| 815 | ACCESSOR(check_interval,check_interval,mce_restart()) | 987 | ACCESSOR(check_interval,check_interval,mce_restart()) |
| 816 | static struct sysdev_attribute *mce_attributes[] = { | 988 | static struct sysdev_attribute *mce_attributes[] = { |
| 817 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | ||
| 818 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, | ||
| 819 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, | 989 | &attr_tolerant.attr, &attr_check_interval, &attr_trigger, |
| 820 | NULL | 990 | NULL |
| 821 | }; | 991 | }; |
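
The show_bank()/set_bank() pair introduced above replaces one hand-written accessor per bank with a single handler that recovers the bank index by pointer arithmetic on the attribute array. A plain C sketch of that trick, outside the sysdev API (names are made up):

#include <stdio.h>

struct attribute {
        char name[8];
};

static unsigned long long bank[6] = { 0xf, 0x1f, ~0ULL, 0, 3, 7 };
static struct attribute bank_attrs[6];

/* one handler for every bank: the index falls out of the pointer offset */
static void show_bank(const struct attribute *attr)
{
        long i = attr - bank_attrs;

        printf("%s = %llx\n", attr->name, bank[i]);
}

int main(void)
{
        int i;

        for (i = 0; i < 6; i++)
                snprintf(bank_attrs[i].name, sizeof(bank_attrs[i].name),
                         "bank%d", i);
        show_bank(&bank_attrs[4]);      /* prints "bank4 = 3" */
        return 0;
}

Because the attribute objects live in one contiguous array, 'attr - bank_attrs' is exactly the bank number, so supporting more banks costs no new code.
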
| @@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
| 845 | if (err) | 1015 | if (err) |
| 846 | goto error; | 1016 | goto error; |
| 847 | } | 1017 | } |
| 1018 | for (i = 0; i < banks; i++) { | ||
| 1019 | err = sysdev_create_file(&per_cpu(device_mce, cpu), | ||
| 1020 | &bank_attrs[i]); | ||
| 1021 | if (err) | ||
| 1022 | goto error2; | ||
| 1023 | } | ||
| 848 | cpu_set(cpu, mce_device_initialized); | 1024 | cpu_set(cpu, mce_device_initialized); |
| 849 | 1025 | ||
| 850 | return 0; | 1026 | return 0; |
| 1027 | error2: | ||
| 1028 | while (--i >= 0) { | ||
| 1029 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
| 1030 | &bank_attrs[i]); | ||
| 1031 | } | ||
| 851 | error: | 1032 | error: |
| 852 | while (i--) { | 1033 | while (--i >= 0) { |
| 853 | sysdev_remove_file(&per_cpu(device_mce,cpu), | 1034 | sysdev_remove_file(&per_cpu(device_mce,cpu), |
| 854 | mce_attributes[i]); | 1035 | mce_attributes[i]); |
| 855 | } | 1036 | } |
| @@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu) | |||
| 868 | for (i = 0; mce_attributes[i]; i++) | 1049 | for (i = 0; mce_attributes[i]; i++) |
| 869 | sysdev_remove_file(&per_cpu(device_mce,cpu), | 1050 | sysdev_remove_file(&per_cpu(device_mce,cpu), |
| 870 | mce_attributes[i]); | 1051 | mce_attributes[i]); |
| 1052 | for (i = 0; i < banks; i++) | ||
| 1053 | sysdev_remove_file(&per_cpu(device_mce, cpu), | ||
| 1054 | &bank_attrs[i]); | ||
| 871 | sysdev_unregister(&per_cpu(device_mce,cpu)); | 1055 | sysdev_unregister(&per_cpu(device_mce,cpu)); |
| 872 | cpu_clear(cpu, mce_device_initialized); | 1056 | cpu_clear(cpu, mce_device_initialized); |
| 873 | } | 1057 | } |
| 874 | 1058 | ||
| 1059 | /* Make sure there are no machine checks on offlined CPUs. */ | ||
| 1060 | static void mce_disable_cpu(void *h) | ||
| 1061 | { | ||
| 1062 | int i; | ||
| 1063 | unsigned long action = *(unsigned long *)h; | ||
| 1064 | |||
| 1065 | if (!mce_available(¤t_cpu_data)) | ||
| 1066 | return; | ||
| 1067 | if (!(action & CPU_TASKS_FROZEN)) | ||
| 1068 | cmci_clear(); | ||
| 1069 | for (i = 0; i < banks; i++) | ||
| 1070 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | static void mce_reenable_cpu(void *h) | ||
| 1074 | { | ||
| 1075 | int i; | ||
| 1076 | unsigned long action = *(unsigned long *)h; | ||
| 1077 | |||
| 1078 | if (!mce_available(¤t_cpu_data)) | ||
| 1079 | return; | ||
| 1080 | if (!(action & CPU_TASKS_FROZEN)) | ||
| 1081 | cmci_reenable(); | ||
| 1082 | for (i = 0; i < banks; i++) | ||
| 1083 | wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); | ||
| 1084 | } | ||
| 1085 | |||
| 875 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ | 1086 | /* Get notified when a cpu comes on/off. Be hotplug friendly. */ |
| 876 | static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, | 1087 | static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, |
| 877 | unsigned long action, void *hcpu) | 1088 | unsigned long action, void *hcpu) |
| 878 | { | 1089 | { |
| 879 | unsigned int cpu = (unsigned long)hcpu; | 1090 | unsigned int cpu = (unsigned long)hcpu; |
| 1091 | struct timer_list *t = &per_cpu(mce_timer, cpu); | ||
| 880 | 1092 | ||
| 881 | switch (action) { | 1093 | switch (action) { |
| 882 | case CPU_ONLINE: | 1094 | case CPU_ONLINE: |
| @@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, | |||
| 891 | threshold_cpu_callback(action, cpu); | 1103 | threshold_cpu_callback(action, cpu); |
| 892 | mce_remove_device(cpu); | 1104 | mce_remove_device(cpu); |
| 893 | break; | 1105 | break; |
| 1106 | case CPU_DOWN_PREPARE: | ||
| 1107 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 1108 | del_timer_sync(t); | ||
| 1109 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | ||
| 1110 | break; | ||
| 1111 | case CPU_DOWN_FAILED: | ||
| 1112 | case CPU_DOWN_FAILED_FROZEN: | ||
| 1113 | t->expires = round_jiffies_relative(jiffies + next_interval); | ||
| 1114 | add_timer_on(t, cpu); | ||
| 1115 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | ||
| 1116 | break; | ||
| 1117 | case CPU_POST_DEAD: | ||
| 1118 | /* intentionally ignoring frozen here */ | ||
| 1119 | cmci_rediscover(cpu); | ||
| 1120 | break; | ||
| 894 | } | 1121 | } |
| 895 | return NOTIFY_OK; | 1122 | return NOTIFY_OK; |
| 896 | } | 1123 | } |
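
The new notifier cases pair up: CPU_DOWN_PREPARE quiesces a CPU's MCE machinery, CPU_DOWN_FAILED must undo exactly that, and CPU_POST_DEAD lets the survivors re-claim CMCI banks. A toy model of that flow (stub actions, hypothetical names):

#include <stdio.h>

enum hp_action { DOWN_PREPARE, DOWN_FAILED, DEAD, POST_DEAD };

static void mce_hotplug(enum hp_action action, int cpu)
{
        switch (action) {
        case DOWN_PREPARE:      /* quiesce: stop polling, drop CMCI banks */
                printf("cpu%d: del timer, disable banks\n", cpu);
                break;
        case DOWN_FAILED:       /* offline aborted: undo DOWN_PREPARE */
                printf("cpu%d: re-arm timer, re-enable banks\n", cpu);
                break;
        case DEAD:              /* per-CPU sysfs device goes away */
                printf("cpu%d: remove sysfs device\n", cpu);
                break;
        case POST_DEAD:         /* survivors take over shared CMCI banks */
                printf("cpu%d gone: rediscover CMCI elsewhere\n", cpu);
                break;
        }
}

int main(void)
{
        mce_hotplug(DOWN_PREPARE, 1);
        mce_hotplug(DOWN_FAILED, 1);    /* paired undo */
        return 0;
}
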
| @@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { | |||
| 899 | .notifier_call = mce_cpu_callback, | 1126 | .notifier_call = mce_cpu_callback, |
| 900 | }; | 1127 | }; |
| 901 | 1128 | ||
| 1129 | static __init int mce_init_banks(void) | ||
| 1130 | { | ||
| 1131 | int i; | ||
| 1132 | |||
| 1133 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
| 1134 | GFP_KERNEL); | ||
| 1135 | if (!bank_attrs) | ||
| 1136 | return -ENOMEM; | ||
| 1137 | |||
| 1138 | for (i = 0; i < banks; i++) { | ||
| 1139 | struct sysdev_attribute *a = &bank_attrs[i]; | ||
| 1140 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | ||
| 1141 | if (!a->attr.name) | ||
| 1142 | goto nomem; | ||
| 1143 | a->attr.mode = 0644; | ||
| 1144 | a->show = show_bank; | ||
| 1145 | a->store = set_bank; | ||
| 1146 | } | ||
| 1147 | return 0; | ||
| 1148 | |||
| 1149 | nomem: | ||
| 1150 | while (--i >= 0) | ||
| 1151 | kfree(bank_attrs[i].attr.name); | ||
| 1152 | kfree(bank_attrs); | ||
| 1153 | bank_attrs = NULL; | ||
| 1154 | return -ENOMEM; | ||
| 1155 | } | ||
| 1156 | |||
| 902 | static __init int mce_init_device(void) | 1157 | static __init int mce_init_device(void) |
| 903 | { | 1158 | { |
| 904 | int err; | 1159 | int err; |
| @@ -906,6 +1161,11 @@ static __init int mce_init_device(void) | |||
| 906 | 1161 | ||
| 907 | if (!mce_available(&boot_cpu_data)) | 1162 | if (!mce_available(&boot_cpu_data)) |
| 908 | return -EIO; | 1163 | return -EIO; |
| 1164 | |||
| 1165 | err = mce_init_banks(); | ||
| 1166 | if (err) | ||
| 1167 | return err; | ||
| 1168 | |||
| 909 | err = sysdev_class_register(&mce_sysclass); | 1169 | err = sysdev_class_register(&mce_sysclass); |
| 910 | if (err) | 1170 | if (err) |
| 911 | return err; | 1171 | return err; |
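
mce_init_banks() above uses a common allocate-then-unwind shape: if naming bank i fails, free names [0, i) in reverse and then the array itself. A userspace sketch of the same shape, with malloc/asprintf standing in for kzalloc/kasprintf (glibc asprintf assumed):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

static char **alloc_names(int n)
{
        char **names = calloc(n, sizeof(*names));
        int i;

        if (!names)
                return NULL;
        for (i = 0; i < n; i++) {
                if (asprintf(&names[i], "bank%d", i) < 0)
                        goto nomem;
        }
        return names;

nomem:
        while (--i >= 0)        /* free only what was actually allocated */
                free(names[i]);
        free(names);
        return NULL;
}

int main(void)
{
        char **names = alloc_names(6);

        if (names)
                printf("first: %s, last: %s\n", names[0], names[5]);
        return 0;
}
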
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 9817506dd469..c5a32f92d07e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c | |||
| @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { | |||
| 79 | 79 | ||
| 80 | static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ | 80 | static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ |
| 81 | 81 | ||
| 82 | static void amd_threshold_interrupt(void); | ||
| 83 | |||
| 82 | /* | 84 | /* |
| 83 | * CPU Initialization | 85 | * CPU Initialization |
| 84 | */ | 86 | */ |
| @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
| 174 | tr.reset = 0; | 176 | tr.reset = 0; |
| 175 | tr.old_limit = 0; | 177 | tr.old_limit = 0; |
| 176 | threshold_restart_bank(&tr); | 178 | threshold_restart_bank(&tr); |
| 179 | |||
| 180 | mce_threshold_vector = amd_threshold_interrupt; | ||
| 177 | } | 181 | } |
| 178 | } | 182 | } |
| 179 | } | 183 | } |
| @@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
| 187 | * the interrupt goes off when error_count reaches threshold_limit. | 191 | * the interrupt goes off when error_count reaches threshold_limit. |
| 188 | * the handler will simply log to mcelog with a software-defined bank number. | 192 | * the handler will simply log to mcelog with a software-defined bank number. |
| 189 | */ | 193 | */ |
| 190 | asmlinkage void mce_threshold_interrupt(void) | 194 | static void amd_threshold_interrupt(void) |
| 191 | { | 195 | { |
| 192 | unsigned int bank, block; | 196 | unsigned int bank, block; |
| 193 | struct mce m; | 197 | struct mce m; |
| 194 | u32 low = 0, high = 0, address = 0; | 198 | u32 low = 0, high = 0, address = 0; |
| 195 | 199 | ||
| 196 | ack_APIC_irq(); | 200 | mce_setup(&m); |
| 197 | exit_idle(); | ||
| 198 | irq_enter(); | ||
| 199 | |||
| 200 | memset(&m, 0, sizeof(m)); | ||
| 201 | rdtscll(m.tsc); | ||
| 202 | m.cpu = smp_processor_id(); | ||
| 203 | 201 | ||
| 204 | /* assume first bank caused it */ | 202 | /* assume first bank caused it */ |
| 205 | for (bank = 0; bank < NR_BANKS; ++bank) { | 203 | for (bank = 0; bank < NR_BANKS; ++bank) { |
| @@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void) | |||
| 233 | 231 | ||
| 234 | /* Log the machine check that caused the threshold | 232 | /* Log the machine check that caused the threshold |
| 235 | event. */ | 233 | event. */ |
| 236 | do_machine_check(NULL, 0); | 234 | machine_check_poll(MCP_TIMESTAMP, |
| 235 | &__get_cpu_var(mce_poll_banks)); | ||
| 237 | 236 | ||
| 238 | if (high & MASK_OVERFLOW_HI) { | 237 | if (high & MASK_OVERFLOW_HI) { |
| 239 | rdmsrl(address, m.misc); | 238 | rdmsrl(address, m.misc); |
| @@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void) | |||
| 243 | + bank * NR_BLOCKS | 242 | + bank * NR_BLOCKS |
| 244 | + block; | 243 | + block; |
| 245 | mce_log(&m); | 244 | mce_log(&m); |
| 246 | goto out; | 245 | return; |
| 247 | } | 246 | } |
| 248 | } | 247 | } |
| 249 | } | 248 | } |
| 250 | out: | ||
| 251 | inc_irq_stat(irq_threshold_count); | ||
| 252 | irq_exit(); | ||
| 253 | } | 249 | } |
| 254 | 250 | ||
| 255 | /* | 251 | /* |
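
The refactor above (together with the new threshold.c further down) routes the APIC threshold interrupt through a function pointer, so the AMD and Intel paths can each install their own handler while sharing the irq entry/exit bookkeeping. A minimal model of that dispatch (illustrative names):

#include <stdio.h>

static void default_threshold(void)
{
        printf("Unexpected threshold interrupt\n");
}

/* vendor init code overrides this, like mce_threshold_vector */
static void (*threshold_vector)(void) = default_threshold;

static void amd_threshold(void)
{
        printf("AMD: poll threshold banks\n");
}

static void threshold_interrupt(void)  /* common entry point */
{
        /* enter-irq bookkeeping would go here */
        threshold_vector();
        /* ack the APIC last to avoid re-entry, as the real handler does */
}

int main(void)
{
        threshold_interrupt();          /* default path */
        threshold_vector = amd_threshold;
        threshold_interrupt();          /* vendor path */
        return 0;
}
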
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aa5e287c98e0..aaa7d9730938 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Intel specific MCE features. | 2 | * Intel specific MCE features. |
| 3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | 3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> |
| 4 | * Copyright (C) 2008, 2009 Intel Corporation | ||
| 5 | * Author: Andi Kleen | ||
| 4 | */ | 6 | */ |
| 5 | 7 | ||
| 6 | #include <linux/init.h> | 8 | #include <linux/init.h> |
| @@ -13,6 +15,7 @@ | |||
| 13 | #include <asm/hw_irq.h> | 15 | #include <asm/hw_irq.h> |
| 14 | #include <asm/idle.h> | 16 | #include <asm/idle.h> |
| 15 | #include <asm/therm_throt.h> | 17 | #include <asm/therm_throt.h> |
| 18 | #include <asm/apic.h> | ||
| 16 | 19 | ||
| 17 | asmlinkage void smp_thermal_interrupt(void) | 20 | asmlinkage void smp_thermal_interrupt(void) |
| 18 | { | 21 | { |
| @@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void) | |||
| 25 | 28 | ||
| 26 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 29 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
| 27 | if (therm_throt_process(msr_val & 1)) | 30 | if (therm_throt_process(msr_val & 1)) |
| 28 | mce_log_therm_throt_event(smp_processor_id(), msr_val); | 31 | mce_log_therm_throt_event(msr_val); |
| 29 | 32 | ||
| 30 | inc_irq_stat(irq_thermal_count); | 33 | inc_irq_stat(irq_thermal_count); |
| 31 | irq_exit(); | 34 | irq_exit(); |
| @@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) | |||
| 85 | return; | 88 | return; |
| 86 | } | 89 | } |
| 87 | 90 | ||
| 91 | /* | ||
| 92 | * Support for Intel Corrected Machine Check Interrupts. This allows | ||
| 93 | * the CPU to raise an interrupt when a corrected machine check happens. | ||
| 94 | * Normally we pick those up using a regular polling timer. | ||
| 95 | * Also supports reliable discovery of shared banks. | ||
| 96 | */ | ||
| 97 | |||
| 98 | static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); | ||
| 99 | |||
| 100 | /* | ||
| 101 | * cmci_discover_lock protects against parallel discovery attempts | ||
| 102 | * which could race against each other. | ||
| 103 | */ | ||
| 104 | static DEFINE_SPINLOCK(cmci_discover_lock); | ||
| 105 | |||
| 106 | #define CMCI_THRESHOLD 1 | ||
| 107 | |||
| 108 | static int cmci_supported(int *banks) | ||
| 109 | { | ||
| 110 | u64 cap; | ||
| 111 | |||
| 112 | /* | ||
| 113 | * Vendor check is not strictly needed, but the initial | ||
| 114 | * initialization is vendor keyed and this | ||
| 115 | * makes sure none of the backdoors are entered otherwise. | ||
| 116 | */ | ||
| 117 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) | ||
| 118 | return 0; | ||
| 119 | if (!cpu_has_apic || lapic_get_maxlvt() < 6) | ||
| 120 | return 0; | ||
| 121 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
| 122 | *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); | ||
| 123 | return !!(cap & MCG_CMCI_P); | ||
| 124 | } | ||
| 125 | |||
| 126 | /* | ||
| 127 | * The interrupt handler. This is called on every event. | ||
| 128 | * Just call the poller directly to log any events. | ||
| 129 | * This could in theory increase the threshold under high load, | ||
| 130 | * but doesn't for now. | ||
| 131 | */ | ||
| 132 | static void intel_threshold_interrupt(void) | ||
| 133 | { | ||
| 134 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
| 135 | mce_notify_user(); | ||
| 136 | } | ||
| 137 | |||
| 138 | static void print_update(char *type, int *hdr, int num) | ||
| 139 | { | ||
| 140 | if (*hdr == 0) | ||
| 141 | printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); | ||
| 142 | *hdr = 1; | ||
| 143 | printk(KERN_CONT " %s:%d", type, num); | ||
| 144 | } | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks | ||
| 148 | * on this CPU. Use the algorithm recommended in the SDM to discover shared | ||
| 149 | * banks. | ||
| 150 | */ | ||
| 151 | static void cmci_discover(int banks, int boot) | ||
| 152 | { | ||
| 153 | unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); | ||
| 154 | int hdr = 0; | ||
| 155 | int i; | ||
| 156 | |||
| 157 | spin_lock(&cmci_discover_lock); | ||
| 158 | for (i = 0; i < banks; i++) { | ||
| 159 | u64 val; | ||
| 160 | |||
| 161 | if (test_bit(i, owned)) | ||
| 162 | continue; | ||
| 163 | |||
| 164 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 165 | |||
| 166 | /* Already owned by someone else? */ | ||
| 167 | if (val & CMCI_EN) { | ||
| 168 | if (test_and_clear_bit(i, owned) || boot) | ||
| 169 | print_update("SHD", &hdr, i); | ||
| 170 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
| 171 | continue; | ||
| 172 | } | ||
| 173 | |||
| 174 | val |= CMCI_EN | CMCI_THRESHOLD; | ||
| 175 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 176 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 177 | |||
| 178 | /* Did the enable bit stick? -- the bank supports CMCI */ | ||
| 179 | if (val & CMCI_EN) { | ||
| 180 | if (!test_and_set_bit(i, owned) || boot) | ||
| 181 | print_update("CMCI", &hdr, i); | ||
| 182 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
| 183 | } else { | ||
| 184 | WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); | ||
| 185 | } | ||
| 186 | } | ||
| 187 | spin_unlock(&cmci_discover_lock); | ||
| 188 | if (hdr) | ||
| 189 | printk(KERN_CONT "\n"); | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * Just in case we missed an event during initialization check | ||
| 194 | * all the CMCI owned banks. | ||
| 195 | */ | ||
| 196 | void cmci_recheck(void) | ||
| 197 | { | ||
| 198 | unsigned long flags; | ||
| 199 | int banks; | ||
| 200 | |||
| 201 | if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) | ||
| 202 | return; | ||
| 203 | local_irq_save(flags); | ||
| 204 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
| 205 | local_irq_restore(flags); | ||
| 206 | } | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Disable CMCI on this CPU for all banks it owns when it goes down. | ||
| 210 | * This allows other CPUs to claim the banks on rediscovery. | ||
| 211 | */ | ||
| 212 | void cmci_clear(void) | ||
| 213 | { | ||
| 214 | int i; | ||
| 215 | int banks; | ||
| 216 | u64 val; | ||
| 217 | |||
| 218 | if (!cmci_supported(&banks)) | ||
| 219 | return; | ||
| 220 | spin_lock(&cmci_discover_lock); | ||
| 221 | for (i = 0; i < banks; i++) { | ||
| 222 | if (!test_bit(i, __get_cpu_var(mce_banks_owned))) | ||
| 223 | continue; | ||
| 224 | /* Disable CMCI */ | ||
| 225 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 226 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | ||
| 227 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
| 228 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | ||
| 229 | } | ||
| 230 | spin_unlock(&cmci_discover_lock); | ||
| 231 | } | ||
| 232 | |||
| 233 | /* | ||
| 234 | * After a CPU went down, cycle through all the others and rediscover | ||
| 235 | * the banks it owned. Must run in process context. | ||
| 236 | */ | ||
| 237 | void cmci_rediscover(int dying) | ||
| 238 | { | ||
| 239 | int banks; | ||
| 240 | int cpu; | ||
| 241 | cpumask_var_t old; | ||
| 242 | |||
| 243 | if (!cmci_supported(&banks)) | ||
| 244 | return; | ||
| 245 | if (!alloc_cpumask_var(&old, GFP_KERNEL)) | ||
| 246 | return; | ||
| 247 | cpumask_copy(old, ¤t->cpus_allowed); | ||
| 248 | |||
| 249 | for_each_online_cpu (cpu) { | ||
| 250 | if (cpu == dying) | ||
| 251 | continue; | ||
| 252 | if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) | ||
| 253 | continue; | ||
| 254 | /* Recheck banks in case CPUs don't all have the same number */ | ||
| 255 | if (cmci_supported(&banks)) | ||
| 256 | cmci_discover(banks, 0); | ||
| 257 | } | ||
| 258 | |||
| 259 | set_cpus_allowed_ptr(current, old); | ||
| 260 | free_cpumask_var(old); | ||
| 261 | } | ||
| 262 | |||
| 263 | /* | ||
| 264 | * Reenable CMCI on this CPU in case a CPU down failed. | ||
| 265 | */ | ||
| 266 | void cmci_reenable(void) | ||
| 267 | { | ||
| 268 | int banks; | ||
| 269 | if (cmci_supported(&banks)) | ||
| 270 | cmci_discover(banks, 0); | ||
| 271 | } | ||
| 272 | |||
| 273 | static __cpuinit void intel_init_cmci(void) | ||
| 274 | { | ||
| 275 | int banks; | ||
| 276 | |||
| 277 | if (!cmci_supported(&banks)) | ||
| 278 | return; | ||
| 279 | |||
| 280 | mce_threshold_vector = intel_threshold_interrupt; | ||
| 281 | cmci_discover(banks, 1); | ||
| 282 | /* | ||
| 283 | * For CPU #0 this runs with the APIC still disabled, but that's | ||
| 284 | * OK because only the vector is set up. We still do another | ||
| 285 | * check of the banks later for CPU #0, just to make sure | ||
| 286 | * we don't miss any events. | ||
| 287 | */ | ||
| 288 | apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); | ||
| 289 | cmci_recheck(); | ||
| 290 | } | ||
| 291 | |||
| 88 | void mce_intel_feature_init(struct cpuinfo_x86 *c) | 292 | void mce_intel_feature_init(struct cpuinfo_x86 *c) |
| 89 | { | 293 | { |
| 90 | intel_init_thermal(c); | 294 | intel_init_thermal(c); |
| 295 | intel_init_cmci(); | ||
| 91 | } | 296 | } |
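
The heart of cmci_discover() above is a write-then-read-back probe: set CMCI_EN in the bank's CTL2 MSR and treat a bit that "sticks" as proof that the bank supports CMCI and is not already owned by another CPU. A simulated-MSR sketch of that probe (the msr_* helpers and the fake banks are stand-ins; CMCI_EN as bit 30 mirrors the kernel's definition):

#include <stdio.h>
#include <stdint.h>

#define CMCI_EN         (1ULL << 30)
#define CMCI_THRESHOLD  1

static uint64_t fake_msr[4] = { 0, CMCI_EN, 0, 0 }; /* bank1 owned elsewhere */

static uint64_t msr_read(int i)              { return fake_msr[i]; }
static void     msr_write(int i, uint64_t v) { if (i != 3) fake_msr[i] = v; }
/* bank3 silently drops writes, modeling a bank without CMCI support */

int main(void)
{
        int i;

        for (i = 0; i < 4; i++) {
                uint64_t val = msr_read(i);

                if (val & CMCI_EN) {            /* already claimed */
                        printf("bank%d: shared, owned elsewhere\n", i);
                        continue;
                }
                msr_write(i, val | CMCI_EN | CMCI_THRESHOLD);
                if (msr_read(i) & CMCI_EN)      /* did the bit stick? */
                        printf("bank%d: CMCI, claimed\n", i);
                else
                        printf("bank%d: no CMCI, keep polling\n", i);
        }
        return 0;
}
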
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 000000000000..23ee9e730f78 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c | |||
| @@ -0,0 +1,29 @@ | |||
| 1 | /* | ||
| 2 | * Common corrected MCE threshold handler code: | ||
| 3 | */ | ||
| 4 | #include <linux/interrupt.h> | ||
| 5 | #include <linux/kernel.h> | ||
| 6 | |||
| 7 | #include <asm/irq_vectors.h> | ||
| 8 | #include <asm/apic.h> | ||
| 9 | #include <asm/idle.h> | ||
| 10 | #include <asm/mce.h> | ||
| 11 | |||
| 12 | static void default_threshold_interrupt(void) | ||
| 13 | { | ||
| 14 | printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", | ||
| 15 | THRESHOLD_APIC_VECTOR); | ||
| 16 | } | ||
| 17 | |||
| 18 | void (*mce_threshold_vector)(void) = default_threshold_interrupt; | ||
| 19 | |||
| 20 | asmlinkage void mce_threshold_interrupt(void) | ||
| 21 | { | ||
| 22 | exit_idle(); | ||
| 23 | irq_enter(); | ||
| 24 | inc_irq_stat(irq_threshold_count); | ||
| 25 | mce_threshold_vector(); | ||
| 26 | irq_exit(); | ||
| 27 | /* Ack only at the end to avoid potential reentry */ | ||
| 28 | ack_APIC_irq(); | ||
| 29 | } | ||
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index b205272ad394..1736acc4d7aa 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
| @@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void) | |||
| 469 | efi_memory_desc_t *md; | 469 | efi_memory_desc_t *md; |
| 470 | efi_status_t status; | 470 | efi_status_t status; |
| 471 | unsigned long size; | 471 | unsigned long size; |
| 472 | u64 end, systab, addr, npages; | 472 | u64 end, systab, addr, npages, end_pfn; |
| 473 | void *p, *va; | 473 | void *p, *va; |
| 474 | 474 | ||
| 475 | efi.systab = NULL; | 475 | efi.systab = NULL; |
| @@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void) | |||
| 481 | size = md->num_pages << EFI_PAGE_SHIFT; | 481 | size = md->num_pages << EFI_PAGE_SHIFT; |
| 482 | end = md->phys_addr + size; | 482 | end = md->phys_addr + size; |
| 483 | 483 | ||
| 484 | if (PFN_UP(end) <= max_low_pfn_mapped) | 484 | end_pfn = PFN_UP(end); |
| 485 | if (end_pfn <= max_low_pfn_mapped | ||
| 486 | || (end_pfn > (1UL << (32 - PAGE_SHIFT)) | ||
| 487 | && end_pfn <= max_pfn_mapped)) | ||
| 485 | va = __va(md->phys_addr); | 488 | va = __va(md->phys_addr); |
| 486 | else | 489 | else |
| 487 | va = efi_ioremap(md->phys_addr, size); | 490 | va = efi_ioremap(md->phys_addr, size); |
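
A quick arithmetic check of the new boundary test above, assuming 4 KiB pages (PAGE_SHIFT = 12):

    1UL << (32 - PAGE_SHIFT) = 1UL << 20 = 0x100000
    pfn 0x100000 * 4 KiB     = 0x100000000 = 4 GiB

So a region is handed to __va() either when it ends below max_low_pfn_mapped, or when it ends above the 4 GiB line (pfn 0x100000) while still below max_pfn_mapped; everything else goes through efi_ioremap().
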
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index a4ee29127fdf..22c3b7828c50 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c | |||
| @@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void) | |||
| 100 | 100 | ||
| 101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) | 101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) |
| 102 | { | 102 | { |
| 103 | static unsigned pages_mapped __initdata; | 103 | unsigned long last_map_pfn; |
| 104 | unsigned i, pages; | ||
| 105 | unsigned long offset; | ||
| 106 | 104 | ||
| 107 | pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); | 105 | last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); |
| 108 | offset = phys_addr & ~PAGE_MASK; | 106 | if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) |
| 109 | phys_addr &= PAGE_MASK; | ||
| 110 | |||
| 111 | if (pages_mapped + pages > MAX_EFI_IO_PAGES) | ||
| 112 | return NULL; | 107 | return NULL; |
| 113 | 108 | ||
| 114 | for (i = 0; i < pages; i++) { | 109 | return (void __iomem *)__va(phys_addr); |
| 115 | __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, | ||
| 116 | phys_addr, PAGE_KERNEL); | ||
| 117 | phys_addr += PAGE_SIZE; | ||
| 118 | pages_mapped++; | ||
| 119 | } | ||
| 120 | |||
| 121 | return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ | ||
| 122 | (pages_mapped - pages)) + offset; | ||
| 123 | } | 110 | } |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b0f61f0dcd0a..f2f8540a7f3d 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
| @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) | |||
| 136 | #ifdef CONFIG_X86_32 | 136 | #ifdef CONFIG_X86_32 |
| 137 | if (!HAVE_HWFP) { | 137 | if (!HAVE_HWFP) { |
| 138 | memset(tsk->thread.xstate, 0, xstate_size); | 138 | memset(tsk->thread.xstate, 0, xstate_size); |
| 139 | finit(); | 139 | finit_task(tsk); |
| 140 | set_stopped_child_used_math(tsk); | 140 | set_stopped_child_used_math(tsk); |
| 141 | return 0; | 141 | return 0; |
| 142 | } | 142 | } |
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 37cb1bda1baf..e8192401da47 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
| @@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) | |||
| 558 | 558 | ||
| 559 | static struct mpf_intel *mpf_found; | 559 | static struct mpf_intel *mpf_found; |
| 560 | 560 | ||
| 561 | static unsigned long __init get_mpc_size(unsigned long physptr) | ||
| 562 | { | ||
| 563 | struct mpc_table *mpc; | ||
| 564 | unsigned long size; | ||
| 565 | |||
| 566 | mpc = early_ioremap(physptr, PAGE_SIZE); | ||
| 567 | size = mpc->length; | ||
| 568 | early_iounmap(mpc, PAGE_SIZE); | ||
| 569 | apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); | ||
| 570 | |||
| 571 | return size; | ||
| 572 | } | ||
| 573 | |||
| 561 | /* | 574 | /* |
| 562 | * Scan the memory blocks for an SMP configuration block. | 575 | * Scan the memory blocks for an SMP configuration block. |
| 563 | */ | 576 | */ |
| @@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early) | |||
| 611 | construct_default_ISA_mptable(mpf->feature1); | 624 | construct_default_ISA_mptable(mpf->feature1); |
| 612 | 625 | ||
| 613 | } else if (mpf->physptr) { | 626 | } else if (mpf->physptr) { |
| 627 | struct mpc_table *mpc; | ||
| 628 | unsigned long size; | ||
| 614 | 629 | ||
| 630 | size = get_mpc_size(mpf->physptr); | ||
| 631 | mpc = early_ioremap(mpf->physptr, size); | ||
| 615 | /* | 632 | /* |
| 616 | * Read the physical hardware table. Anything here will | 633 | * Read the physical hardware table. Anything here will |
| 617 | * override the defaults. | 634 | * override the defaults. |
| 618 | */ | 635 | */ |
| 619 | if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { | 636 | if (!smp_read_mpc(mpc, early)) { |
| 620 | #ifdef CONFIG_X86_LOCAL_APIC | 637 | #ifdef CONFIG_X86_LOCAL_APIC |
| 621 | smp_found_config = 0; | 638 | smp_found_config = 0; |
| 622 | #endif | 639 | #endif |
| @@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early) | |||
| 624 | "BIOS bug, MP table errors detected!...\n"); | 641 | "BIOS bug, MP table errors detected!...\n"); |
| 625 | printk(KERN_ERR "... disabling SMP support. " | 642 | printk(KERN_ERR "... disabling SMP support. " |
| 626 | "(tell your hw vendor)\n"); | 643 | "(tell your hw vendor)\n"); |
| 644 | early_iounmap(mpc, size); | ||
| 627 | return; | 645 | return; |
| 628 | } | 646 | } |
| 647 | early_iounmap(mpc, size); | ||
| 629 | 648 | ||
| 630 | if (early) | 649 | if (early) |
| 631 | return; | 650 | return; |
| @@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, | |||
| 697 | 716 | ||
| 698 | if (!reserve) | 717 | if (!reserve) |
| 699 | return 1; | 718 | return 1; |
| 700 | reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, | 719 | reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), |
| 701 | BOOTMEM_DEFAULT); | 720 | BOOTMEM_DEFAULT); |
| 702 | if (mpf->physptr) { | 721 | if (mpf->physptr) { |
| 703 | unsigned long size = PAGE_SIZE; | 722 | unsigned long size = get_mpc_size(mpf->physptr); |
| 704 | #ifdef CONFIG_X86_32 | 723 | #ifdef CONFIG_X86_32 |
| 705 | /* | 724 | /* |
| 706 | * We cannot access the MPC table to compute | 725 |
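
get_mpc_size() above follows a map-header-first pattern: map a single page, read the length field that says how big the table really is, unmap, and let the caller re-map with the correct size. A userspace sketch of the header-probing half (the struct layout is a simplified stand-in and little-endian byte order is assumed):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct mpc_table_hdr {
        char     signature[4];
        uint16_t length;                /* total table size in bytes */
};

/* pretend this 64-byte blob sits at the physptr we were handed */
static unsigned char blob[64] = { 'P', 'C', 'M', 'P', 64, 0 };

static size_t get_table_size(const void *physptr)
{
        struct mpc_table_hdr hdr;

        /* first mapping: only the fixed-size header (one page, really) */
        memcpy(&hdr, physptr, sizeof(hdr));
        return hdr.length;              /* caller re-maps this many bytes */
}

int main(void)
{
        printf("mpc table size: %zu bytes\n", get_table_size(blob));
        return 0;
}
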
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1cc18d439bbb..2aef36d8aca2 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
| @@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
| 216 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), | 216 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), |
| 217 | }, | 217 | }, |
| 218 | }, | 218 | }, |
| 219 | { /* Handle problems with rebooting on Dell XPS710 */ | ||
| 220 | .callback = set_bios_reboot, | ||
| 221 | .ident = "Dell XPS710", | ||
| 222 | .matches = { | ||
| 223 | DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), | ||
| 224 | DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), | ||
| 225 | }, | ||
| 226 | }, | ||
| 219 | { } | 227 | { } |
| 220 | }; | 228 | }; |
| 221 | 229 | ||
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4c54bc0d8ff3..f28c56e6bf94 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
| @@ -202,7 +202,9 @@ struct ist_info ist_info; | |||
| 202 | #endif | 202 | #endif |
| 203 | 203 | ||
| 204 | #else | 204 | #else |
| 205 | struct cpuinfo_x86 boot_cpu_data __read_mostly; | 205 | struct cpuinfo_x86 boot_cpu_data __read_mostly = { |
| 206 | .x86_phys_bits = MAX_PHYSMEM_BITS, | ||
| 207 | }; | ||
| 206 | EXPORT_SYMBOL(boot_cpu_data); | 208 | EXPORT_SYMBOL(boot_cpu_data); |
| 207 | #endif | 209 | #endif |
| 208 | 210 | ||
| @@ -770,6 +772,9 @@ void __init setup_arch(char **cmdline_p) | |||
| 770 | 772 | ||
| 771 | finish_e820_parsing(); | 773 | finish_e820_parsing(); |
| 772 | 774 | ||
| 775 | if (efi_enabled) | ||
| 776 | efi_init(); | ||
| 777 | |||
| 773 | dmi_scan_machine(); | 778 | dmi_scan_machine(); |
| 774 | 779 | ||
| 775 | dmi_check_system(bad_bios_dmi_table); | 780 | dmi_check_system(bad_bios_dmi_table); |
| @@ -789,8 +794,6 @@ void __init setup_arch(char **cmdline_p) | |||
| 789 | insert_resource(&iomem_resource, &data_resource); | 794 | insert_resource(&iomem_resource, &data_resource); |
| 790 | insert_resource(&iomem_resource, &bss_resource); | 795 | insert_resource(&iomem_resource, &bss_resource); |
| 791 | 796 | ||
| 792 | if (efi_enabled) | ||
| 793 | efi_init(); | ||
| 794 | 797 | ||
| 795 | #ifdef CONFIG_X86_32 | 798 | #ifdef CONFIG_X86_32 |
| 796 | if (ppro_with_ram_bug()) { | 799 | if (ppro_with_ram_bug()) { |
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 491e737ce547..aa0987088774 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c | |||
| @@ -30,20 +30,29 @@ static void fclex(void) | |||
| 30 | } | 30 | } |
| 31 | 31 | ||
| 32 | /* Needs to be externally visible */ | 32 | /* Needs to be externally visible */ |
| 33 | void finit(void) | 33 | void finit_task(struct task_struct *tsk) |
| 34 | { | 34 | { |
| 35 | control_word = 0x037f; | 35 | struct i387_soft_struct *soft = &tsk->thread.xstate->soft; |
| 36 | partial_status = 0; | 36 | struct address *oaddr, *iaddr; |
| 37 | top = 0; /* We don't keep top in the status word internally. */ | 37 | soft->cwd = 0x037f; |
| 38 | fpu_tag_word = 0xffff; | 38 | soft->swd = 0; |
| 39 | soft->ftop = 0; /* We don't keep top in the status word internally. */ | ||
| 40 | soft->twd = 0xffff; | ||
| 39 | /* The behaviour is different from that detailed in | 41 | /* The behaviour is different from that detailed in |
| 40 | Section 15.1.6 of the Intel manual */ | 42 | Section 15.1.6 of the Intel manual */ |
| 41 | operand_address.offset = 0; | 43 | oaddr = (struct address *)&soft->foo; |
| 42 | operand_address.selector = 0; | 44 | oaddr->offset = 0; |
| 43 | instruction_address.offset = 0; | 45 | oaddr->selector = 0; |
| 44 | instruction_address.selector = 0; | 46 | iaddr = (struct address *)&soft->fip; |
| 45 | instruction_address.opcode = 0; | 47 | iaddr->offset = 0; |
| 46 | no_ip_update = 1; | 48 | iaddr->selector = 0; |
| 49 | iaddr->opcode = 0; | ||
| 50 | soft->no_update = 1; | ||
| 51 | } | ||
| 52 | |||
| 53 | void finit(void) | ||
| 54 | { | ||
| 55 | finit_task(current); | ||
| 47 | } | 56 | } |
| 48 | 57 | ||
| 49 | /* | 58 | /* |
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 00f127c80b0e..d11745334a67 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
| @@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap); | |||
| 158 | EXPORT_SYMBOL(kmap_atomic); | 158 | EXPORT_SYMBOL(kmap_atomic); |
| 159 | EXPORT_SYMBOL(kunmap_atomic); | 159 | EXPORT_SYMBOL(kunmap_atomic); |
| 160 | 160 | ||
| 161 | #ifdef CONFIG_NUMA | ||
| 162 | void __init set_highmem_pages_init(void) | 161 | void __init set_highmem_pages_init(void) |
| 163 | { | 162 | { |
| 164 | struct zone *zone; | 163 | struct zone *zone; |
| @@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void) | |||
| 182 | } | 181 | } |
| 183 | totalram_pages += totalhigh_pages; | 182 | totalram_pages += totalhigh_pages; |
| 184 | } | 183 | } |
| 185 | #else | ||
| 186 | void __init set_highmem_pages_init(void) | ||
| 187 | { | ||
| 188 | add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); | ||
| 189 | |||
| 190 | totalram_pages += totalhigh_pages; | ||
| 191 | } | ||
| 192 | #endif /* CONFIG_NUMA */ | ||
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ce6a722587d8..6d63e3d1253d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
| @@ -1,8 +1,345 @@ | |||
| 1 | #include <linux/ioport.h> | ||
| 1 | #include <linux/swap.h> | 2 | #include <linux/swap.h> |
| 3 | |||
| 2 | #include <asm/cacheflush.h> | 4 | #include <asm/cacheflush.h> |
| 5 | #include <asm/e820.h> | ||
| 6 | #include <asm/init.h> | ||
| 3 | #include <asm/page.h> | 7 | #include <asm/page.h> |
| 8 | #include <asm/page_types.h> | ||
| 4 | #include <asm/sections.h> | 9 | #include <asm/sections.h> |
| 5 | #include <asm/system.h> | 10 | #include <asm/system.h> |
| 11 | #include <asm/tlbflush.h> | ||
| 12 | |||
| 13 | unsigned long __initdata e820_table_start; | ||
| 14 | unsigned long __meminitdata e820_table_end; | ||
| 15 | unsigned long __meminitdata e820_table_top; | ||
| 16 | |||
| 17 | int after_bootmem; | ||
| 18 | |||
| 19 | int direct_gbpages | ||
| 20 | #ifdef CONFIG_DIRECT_GBPAGES | ||
| 21 | = 1 | ||
| 22 | #endif | ||
| 23 | ; | ||
| 24 | |||
| 25 | static void __init find_early_table_space(unsigned long end, int use_pse, | ||
| 26 | int use_gbpages) | ||
| 27 | { | ||
| 28 | unsigned long puds, pmds, ptes, tables, start; | ||
| 29 | |||
| 30 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
| 31 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | ||
| 32 | |||
| 33 | if (use_gbpages) { | ||
| 34 | unsigned long extra; | ||
| 35 | |||
| 36 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); | ||
| 37 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 38 | } else | ||
| 39 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 40 | |||
| 41 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
| 42 | |||
| 43 | if (use_pse) { | ||
| 44 | unsigned long extra; | ||
| 45 | |||
| 46 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | ||
| 47 | #ifdef CONFIG_X86_32 | ||
| 48 | extra += PMD_SIZE; | ||
| 49 | #endif | ||
| 50 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 51 | } else | ||
| 52 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 53 | |||
| 54 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); | ||
| 55 | |||
| 56 | #ifdef CONFIG_X86_32 | ||
| 57 | /* for fixmap */ | ||
| 58 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); | ||
| 59 | #endif | ||
| 60 | |||
| 61 | /* | ||
| 62 | * RED-PEN putting page tables only on node 0 could | ||
| 63 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
| 64 | * need roughly 0.5KB per GB. | ||
| 65 | */ | ||
| 66 | #ifdef CONFIG_X86_32 | ||
| 67 | start = 0x7000; | ||
| 68 | e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, | ||
| 69 | tables, PAGE_SIZE); | ||
| 70 | #else /* CONFIG_X86_64 */ | ||
| 71 | start = 0x8000; | ||
| 72 | e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE); | ||
| 73 | #endif | ||
| 74 | if (e820_table_start == -1UL) | ||
| 75 | panic("Cannot find space for the kernel page tables"); | ||
| 76 | |||
| 77 | e820_table_start >>= PAGE_SHIFT; | ||
| 78 | e820_table_end = e820_table_start; | ||
| 79 | e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); | ||
| 80 | |||
| 81 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
| 82 | end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); | ||
| 83 | } | ||
| 84 | |||
| 85 | struct map_range { | ||
| 86 | unsigned long start; | ||
| 87 | unsigned long end; | ||
| 88 | unsigned page_size_mask; | ||
| 89 | }; | ||
| 90 | |||
| 91 | #ifdef CONFIG_X86_32 | ||
| 92 | #define NR_RANGE_MR 3 | ||
| 93 | #else /* CONFIG_X86_64 */ | ||
| 94 | #define NR_RANGE_MR 5 | ||
| 95 | #endif | ||
| 96 | |||
| 97 | static int save_mr(struct map_range *mr, int nr_range, | ||
| 98 | unsigned long start_pfn, unsigned long end_pfn, | ||
| 99 | unsigned long page_size_mask) | ||
| 100 | { | ||
| 101 | if (start_pfn < end_pfn) { | ||
| 102 | if (nr_range >= NR_RANGE_MR) | ||
| 103 | panic("ran out of ranges for init_memory_mapping\n"); | ||
| 104 | mr[nr_range].start = start_pfn<<PAGE_SHIFT; | ||
| 105 | mr[nr_range].end = end_pfn<<PAGE_SHIFT; | ||
| 106 | mr[nr_range].page_size_mask = page_size_mask; | ||
| 107 | nr_range++; | ||
| 108 | } | ||
| 109 | |||
| 110 | return nr_range; | ||
| 111 | } | ||
| 112 | |||
| 113 | #ifdef CONFIG_X86_64 | ||
| 114 | static void __init init_gbpages(void) | ||
| 115 | { | ||
| 116 | if (direct_gbpages && cpu_has_gbpages) | ||
| 117 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
| 118 | else | ||
| 119 | direct_gbpages = 0; | ||
| 120 | } | ||
| 121 | #else | ||
| 122 | static inline void init_gbpages(void) | ||
| 123 | { | ||
| 124 | } | ||
| 125 | #endif | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Set up the direct mapping of the physical memory at PAGE_OFFSET. | ||
| 129 | * This runs before bootmem is initialized and gets pages directly from | ||
| 130 | * the physical memory. To access them, they are temporarily mapped. | ||
| 131 | */ | ||
| 132 | unsigned long __init_refok init_memory_mapping(unsigned long start, | ||
| 133 | unsigned long end) | ||
| 134 | { | ||
| 135 | unsigned long page_size_mask = 0; | ||
| 136 | unsigned long start_pfn, end_pfn; | ||
| 137 | unsigned long pos; | ||
| 138 | unsigned long ret; | ||
| 139 | |||
| 140 | struct map_range mr[NR_RANGE_MR]; | ||
| 141 | int nr_range, i; | ||
| 142 | int use_pse, use_gbpages; | ||
| 143 | |||
| 144 | printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); | ||
| 145 | |||
| 146 | if (!after_bootmem) | ||
| 147 | init_gbpages(); | ||
| 148 | |||
| 149 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 150 | /* | ||
| 151 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
| 152 | * This will simplify cpa(), which otherwise needs to support splitting | ||
| 153 | * large pages into small in interrupt context, etc. | ||
| 154 | */ | ||
| 155 | use_pse = use_gbpages = 0; | ||
| 156 | #else | ||
| 157 | use_pse = cpu_has_pse; | ||
| 158 | use_gbpages = direct_gbpages; | ||
| 159 | #endif | ||
| 160 | |||
| 161 | #ifdef CONFIG_X86_32 | ||
| 162 | #ifdef CONFIG_X86_PAE | ||
| 163 | set_nx(); | ||
| 164 | if (nx_enabled) | ||
| 165 | printk(KERN_INFO "NX (Execute Disable) protection: active\n"); | ||
| 166 | #endif | ||
| 167 | |||
| 168 | /* Enable PSE if available */ | ||
| 169 | if (cpu_has_pse) | ||
| 170 | set_in_cr4(X86_CR4_PSE); | ||
| 171 | |||
| 172 | /* Enable PGE if available */ | ||
| 173 | if (cpu_has_pge) { | ||
| 174 | set_in_cr4(X86_CR4_PGE); | ||
| 175 | __supported_pte_mask |= _PAGE_GLOBAL; | ||
| 176 | } | ||
| 177 | #endif | ||
| 178 | |||
| 179 | if (use_gbpages) | ||
| 180 | page_size_mask |= 1 << PG_LEVEL_1G; | ||
| 181 | if (use_pse) | ||
| 182 | page_size_mask |= 1 << PG_LEVEL_2M; | ||
| 183 | |||
| 184 | memset(mr, 0, sizeof(mr)); | ||
| 185 | nr_range = 0; | ||
| 186 | |||
| 187 | /* head chunk if start is not big page aligned */ | ||
| 188 | start_pfn = start >> PAGE_SHIFT; | ||
| 189 | pos = start_pfn << PAGE_SHIFT; | ||
| 190 | #ifdef CONFIG_X86_32 | ||
| 191 | /* | ||
| 192 | * Don't use a large page for the first 2/4MB of memory | ||
| 193 | * because there are often fixed size MTRRs in there | ||
| 194 | * and overlapping MTRRs into large pages can cause | ||
| 195 | * slowdowns. | ||
| 196 | */ | ||
| 197 | if (pos == 0) | ||
| 198 | end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); | ||
| 199 | else | ||
| 200 | end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 201 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 202 | #else /* CONFIG_X86_64 */ | ||
| 203 | end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) | ||
| 204 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 205 | #endif | ||
| 206 | if (end_pfn > (end >> PAGE_SHIFT)) | ||
| 207 | end_pfn = end >> PAGE_SHIFT; | ||
| 208 | if (start_pfn < end_pfn) { | ||
| 209 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 210 | pos = end_pfn << PAGE_SHIFT; | ||
| 211 | } | ||
| 212 | |||
| 213 | /* big page (2M) range */ | ||
| 214 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 215 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 216 | #ifdef CONFIG_X86_32 | ||
| 217 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 218 | #else /* CONFIG_X86_64 */ | ||
| 219 | end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 220 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 221 | if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) | ||
| 222 | end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); | ||
| 223 | #endif | ||
| 224 | |||
| 225 | if (start_pfn < end_pfn) { | ||
| 226 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 227 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 228 | pos = end_pfn << PAGE_SHIFT; | ||
| 229 | } | ||
| 230 | |||
| 231 | #ifdef CONFIG_X86_64 | ||
| 232 | /* big page (1G) range */ | ||
| 233 | start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 234 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 235 | end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); | ||
| 236 | if (start_pfn < end_pfn) { | ||
| 237 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 238 | page_size_mask & | ||
| 239 | ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); | ||
| 240 | pos = end_pfn << PAGE_SHIFT; | ||
| 241 | } | ||
| 242 | |||
| 243 | /* tail that is not big page (1G) aligned */ | ||
| 244 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 245 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 246 | end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 247 | if (start_pfn < end_pfn) { | ||
| 248 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 249 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 250 | pos = end_pfn << PAGE_SHIFT; | ||
| 251 | } | ||
| 252 | #endif | ||
| 253 | |||
| 254 | /* tail that is not big page (2M) aligned */ | ||
| 255 | start_pfn = pos>>PAGE_SHIFT; | ||
| 256 | end_pfn = end>>PAGE_SHIFT; | ||
| 257 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 258 | |||
| 259 | /* try to merge contiguous ranges with the same page size */ | ||
| 260 | for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { | ||
| 261 | unsigned long old_start; | ||
| 262 | if (mr[i].end != mr[i+1].start || | ||
| 263 | mr[i].page_size_mask != mr[i+1].page_size_mask) | ||
| 264 | continue; | ||
| 265 | /* move it */ | ||
| 266 | old_start = mr[i].start; | ||
| 267 | memmove(&mr[i], &mr[i+1], | ||
| 268 | (nr_range - 1 - i) * sizeof(struct map_range)); | ||
| 269 | mr[i--].start = old_start; | ||
| 270 | nr_range--; | ||
| 271 | } | ||
| 272 | |||
| 273 | for (i = 0; i < nr_range; i++) | ||
| 274 | printk(KERN_DEBUG " %010lx - %010lx page %s\n", | ||
| 275 | mr[i].start, mr[i].end, | ||
| 276 | (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( | ||
| 277 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); | ||
| 278 | |||
| 279 | /* | ||
| 280 | * Find space for the kernel direct mapping tables. | ||
| 281 | * | ||
| 282 | * Later we should allocate these tables on the node local to the | ||
| 283 | * memory being mapped. Unfortunately this currently happens before the | ||
| 284 | * nodes are discovered. | ||
| 285 | */ | ||
| 286 | if (!after_bootmem) | ||
| 287 | find_early_table_space(end, use_pse, use_gbpages); | ||
| 288 | |||
| 289 | #ifdef CONFIG_X86_32 | ||
| 290 | for (i = 0; i < nr_range; i++) | ||
| 291 | kernel_physical_mapping_init(mr[i].start, mr[i].end, | ||
| 292 | mr[i].page_size_mask); | ||
| 293 | ret = end; | ||
| 294 | #else /* CONFIG_X86_64 */ | ||
| 295 | for (i = 0; i < nr_range; i++) | ||
| 296 | ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, | ||
| 297 | mr[i].page_size_mask); | ||
| 298 | #endif | ||
| 299 | |||
| 300 | #ifdef CONFIG_X86_32 | ||
| 301 | early_ioremap_page_table_range_init(); | ||
| 302 | |||
| 303 | load_cr3(swapper_pg_dir); | ||
| 304 | #endif | ||
| 305 | |||
| 306 | #ifdef CONFIG_X86_64 | ||
| 307 | if (!after_bootmem) | ||
| 308 | mmu_cr4_features = read_cr4(); | ||
| 309 | #endif | ||
| 310 | __flush_tlb_all(); | ||
| 311 | |||
| 312 | if (!after_bootmem && e820_table_end > e820_table_start) | ||
| 313 | reserve_early(e820_table_start << PAGE_SHIFT, | ||
| 314 | e820_table_end << PAGE_SHIFT, "PGTABLE"); | ||
| 315 | |||
| 316 | if (!after_bootmem) | ||
| 317 | early_memtest(start, end); | ||
| 318 | |||
| 319 | return ret >> PAGE_SHIFT; | ||
| 320 | } | ||
| 321 | |||
| 322 | |||
| 323 | /* | ||
| 324 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
| 325 | * is valid. The argument is a physical page number. | ||
| 326 | * | ||
| 327 | * | ||
| 328 | * On x86, access has to be given to the first megabyte of ram because that area | ||
| 329 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
| 330 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
| 331 | * mmio resources as well as potential bios/acpi data regions. | ||
| 332 | */ | ||
| 333 | int devmem_is_allowed(unsigned long pagenr) | ||
| 334 | { | ||
| 335 | if (pagenr <= 256) | ||
| 336 | return 1; | ||
| 337 | if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) | ||
| 338 | return 0; | ||
| 339 | if (!page_is_ram(pagenr)) | ||
| 340 | return 1; | ||
| 341 | return 0; | ||
| 342 | } | ||
| 6 | 343 | ||
| 7 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | 344 | void free_init_pages(char *what, unsigned long begin, unsigned long end) |
| 8 | { | 345 | { |
| @@ -47,3 +384,10 @@ void free_initmem(void) | |||
| 47 | (unsigned long)(&__init_begin), | 384 | (unsigned long)(&__init_begin), |
| 48 | (unsigned long)(&__init_end)); | 385 | (unsigned long)(&__init_end)); |
| 49 | } | 386 | } |
| 387 | |||
| 388 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 389 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
| 390 | { | ||
| 391 | free_init_pages("initrd memory", start, end); | ||
| 392 | } | ||
| 393 | #endif | ||
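
As a worked example of the range splitting in the new init_memory_mapping(), consider a hypothetical 64-bit machine with gbpages mapping 0 through 0x7ff00000 (just under 2 GiB). The head is already 2 MiB aligned, so only three ranges survive, and the KERN_DEBUG printout would read:

    init_memory_mapping: 0000000000000000-000000007ff00000
     0000000000 - 0040000000 page 1G
     0040000000 - 007fe00000 page 2M
     007fe00000 - 007ff00000 page 4k

The merge pass changes nothing here, because each pair of adjacent ranges carries a different page size mask.
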
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 47df0e1bbeb9..2966c6b8d304 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
| @@ -49,6 +49,7 @@ | |||
| 49 | #include <asm/paravirt.h> | 49 | #include <asm/paravirt.h> |
| 50 | #include <asm/setup.h> | 50 | #include <asm/setup.h> |
| 51 | #include <asm/cacheflush.h> | 51 | #include <asm/cacheflush.h> |
| 52 | #include <asm/init.h> | ||
| 52 | 53 | ||
| 53 | unsigned long max_low_pfn_mapped; | 54 | unsigned long max_low_pfn_mapped; |
| 54 | unsigned long max_pfn_mapped; | 55 | unsigned long max_pfn_mapped; |
| @@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn; | |||
| 58 | 59 | ||
| 59 | static noinline int do_test_wp_bit(void); | 60 | static noinline int do_test_wp_bit(void); |
| 60 | 61 | ||
| 61 | 62 | bool __read_mostly __vmalloc_start_set = false; | |
| 62 | static unsigned long __initdata table_start; | ||
| 63 | static unsigned long __meminitdata table_end; | ||
| 64 | static unsigned long __meminitdata table_top; | ||
| 65 | |||
| 66 | static int __initdata after_init_bootmem; | ||
| 67 | 63 | ||
| 68 | static __init void *alloc_low_page(void) | 64 | static __init void *alloc_low_page(void) |
| 69 | { | 65 | { |
| 70 | unsigned long pfn = table_end++; | 66 | unsigned long pfn = e820_table_end++; |
| 71 | void *adr; | 67 | void *adr; |
| 72 | 68 | ||
| 73 | if (pfn >= table_top) | 69 | if (pfn >= e820_table_top) |
| 74 | panic("alloc_low_page: ran out of memory"); | 70 | panic("alloc_low_page: ran out of memory"); |
| 75 | 71 | ||
| 76 | adr = __va(pfn * PAGE_SIZE); | 72 | adr = __va(pfn * PAGE_SIZE); |
| @@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) | |||
| 90 | 86 | ||
| 91 | #ifdef CONFIG_X86_PAE | 87 | #ifdef CONFIG_X86_PAE |
| 92 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { | 88 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { |
| 93 | if (after_init_bootmem) | 89 | if (after_bootmem) |
| 94 | pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); | 90 | pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); |
| 95 | else | 91 | else |
| 96 | pmd_table = (pmd_t *)alloc_low_page(); | 92 | pmd_table = (pmd_t *)alloc_low_page(); |
| @@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
| 117 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { | 113 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { |
| 118 | pte_t *page_table = NULL; | 114 | pte_t *page_table = NULL; |
| 119 | 115 | ||
| 120 | if (after_init_bootmem) { | 116 | if (after_bootmem) { |
| 121 | #ifdef CONFIG_DEBUG_PAGEALLOC | 117 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 122 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); | 118 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); |
| 123 | #endif | 119 | #endif |
| @@ -168,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, | |||
| 168 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end | 164 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end |
| 169 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin | 165 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin |
| 170 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end | 166 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end |
| 171 | && ((__pa(pte) >> PAGE_SHIFT) < table_start | 167 | && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start |
| 172 | || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { | 168 | || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { |
| 173 | pte_t *newpte; | 169 | pte_t *newpte; |
| 174 | int i; | 170 | int i; |
| 175 | 171 | ||
| 176 | BUG_ON(after_init_bootmem); | 172 | BUG_ON(after_bootmem); |
| 177 | newpte = alloc_low_page(); | 173 | newpte = alloc_low_page(); |
| 178 | for (i = 0; i < PTRS_PER_PTE; i++) | 174 | for (i = 0; i < PTRS_PER_PTE; i++) |
| 179 | set_pte(newpte + i, pte[i]); | 175 | set_pte(newpte + i, pte[i]); |
| @@ -242,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr) | |||
| 242 | * of max_low_pfn pages, by creating page tables starting from address | 238 | * of max_low_pfn pages, by creating page tables starting from address |
| 243 | * PAGE_OFFSET: | 239 | * PAGE_OFFSET: |
| 244 | */ | 240 | */ |
| 245 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | 241 | unsigned long __init |
| 246 | unsigned long start_pfn, | 242 | kernel_physical_mapping_init(unsigned long start, |
| 247 | unsigned long end_pfn, | 243 | unsigned long end, |
| 248 | int use_pse) | 244 | unsigned long page_size_mask) |
| 249 | { | 245 | { |
| 246 | int use_pse = page_size_mask == (1<<PG_LEVEL_2M); | ||
| 247 | unsigned long start_pfn, end_pfn; | ||
| 248 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 250 | int pgd_idx, pmd_idx, pte_ofs; | 249 | int pgd_idx, pmd_idx, pte_ofs; |
| 251 | unsigned long pfn; | 250 | unsigned long pfn; |
| 252 | pgd_t *pgd; | 251 | pgd_t *pgd; |
| @@ -255,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | |||
| 255 | unsigned pages_2m, pages_4k; | 254 | unsigned pages_2m, pages_4k; |
| 256 | int mapping_iter; | 255 | int mapping_iter; |
| 257 | 256 | ||
| 257 | start_pfn = start >> PAGE_SHIFT; | ||
| 258 | end_pfn = end >> PAGE_SHIFT; | ||
| 259 | |||
| 258 | /* | 260 | /* |
| 259 | * First iteration will setup identity mapping using large/small pages | 261 | * First iteration will setup identity mapping using large/small pages |
| 260 | * based on use_pse, with other attributes same as set by | 262 | * based on use_pse, with other attributes same as set by |
| @@ -369,26 +371,6 @@ repeat: | |||
| 369 | mapping_iter = 2; | 371 | mapping_iter = 2; |
| 370 | goto repeat; | 372 | goto repeat; |
| 371 | } | 373 | } |
| 372 | } | ||
| 373 | |||
| 374 | /* | ||
| 375 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
| 376 | * is valid. The argument is a physical page number. | ||
| 377 | * | ||
| 378 | * | ||
| 379 | * On x86, access has to be given to the first megabyte of ram because that area | ||
| 380 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
| 381 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
| 382 | * mmio resources as well as potential bios/acpi data regions. | ||
| 383 | */ | ||
| 384 | int devmem_is_allowed(unsigned long pagenr) | ||
| 385 | { | ||
| 386 | if (pagenr <= 256) | ||
| 387 | return 1; | ||
| 388 | if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) | ||
| 389 | return 0; | ||
| 390 | if (!page_is_ram(pagenr)) | ||
| 391 | return 1; | ||
| 392 | return 0; | 374 | return 0; |
| 393 | } | 375 | } |
| 394 | 376 | ||
| @@ -545,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base) | |||
| 545 | * be partially populated, and so it avoids stomping on any existing | 527 | * be partially populated, and so it avoids stomping on any existing |
| 546 | * mappings. | 528 | * mappings. |
| 547 | */ | 529 | */ |
| 548 | static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) | 530 | void __init early_ioremap_page_table_range_init(void) |
| 549 | { | 531 | { |
| 532 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 550 | unsigned long vaddr, end; | 533 | unsigned long vaddr, end; |
| 551 | 534 | ||
| 552 | /* | 535 | /* |
| @@ -641,7 +624,7 @@ static int __init noexec_setup(char *str) | |||
| 641 | } | 624 | } |
| 642 | early_param("noexec", noexec_setup); | 625 | early_param("noexec", noexec_setup); |
| 643 | 626 | ||
| 644 | static void __init set_nx(void) | 627 | void __init set_nx(void) |
| 645 | { | 628 | { |
| 646 | unsigned int v[4], l, h; | 629 | unsigned int v[4], l, h; |
| 647 | 630 | ||
| @@ -793,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn, | |||
| 793 | #ifdef CONFIG_FLATMEM | 776 | #ifdef CONFIG_FLATMEM |
| 794 | max_mapnr = num_physpages; | 777 | max_mapnr = num_physpages; |
| 795 | #endif | 778 | #endif |
| 779 | __vmalloc_start_set = true; | ||
| 780 | |||
| 796 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | 781 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", |
| 797 | pages_to_mb(max_low_pfn)); | 782 | pages_to_mb(max_low_pfn)); |
| 798 | 783 | ||
| @@ -814,176 +799,61 @@ static void __init zone_sizes_init(void) | |||
| 814 | free_area_init_nodes(max_zone_pfns); | 799 | free_area_init_nodes(max_zone_pfns); |
| 815 | } | 800 | } |
| 816 | 801 | ||
| 802 | static unsigned long __init setup_node_bootmem(int nodeid, | ||
| 803 | unsigned long start_pfn, | ||
| 804 | unsigned long end_pfn, | ||
| 805 | unsigned long bootmap) | ||
| 806 | { | ||
| 807 | unsigned long bootmap_size; | ||
| 808 | |||
| 809 | if (start_pfn > max_low_pfn) | ||
| 810 | return bootmap; | ||
| 811 | if (end_pfn > max_low_pfn) | ||
| 812 | end_pfn = max_low_pfn; | ||
| 813 | |||
| 814 | /* don't touch min_low_pfn */ | ||
| 815 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
| 816 | bootmap >> PAGE_SHIFT, | ||
| 817 | start_pfn, end_pfn); | ||
| 818 | printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", | ||
| 819 | nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
| 820 | printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", | ||
| 821 | nodeid, bootmap, bootmap + bootmap_size); | ||
| 822 | free_bootmem_with_active_regions(nodeid, end_pfn); | ||
| 823 | early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
| 824 | |||
| 825 | return bootmap + bootmap_size; | ||
| 826 | } | ||
| 827 | |||
| 817 | void __init setup_bootmem_allocator(void) | 828 | void __init setup_bootmem_allocator(void) |
| 818 | { | 829 | { |
| 819 | int i; | 830 | int nodeid; |
| 820 | unsigned long bootmap_size, bootmap; | 831 | unsigned long bootmap_size, bootmap; |
| 821 | /* | 832 | /* |
| 822 | * Initialize the boot-time allocator (with low memory only): | 833 | * Initialize the boot-time allocator (with low memory only): |
| 823 | */ | 834 | */ |
| 824 | bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; | 835 | bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; |
| 825 | bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, | 836 | bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size, |
| 826 | max_pfn_mapped<<PAGE_SHIFT, bootmap_size, | ||
| 827 | PAGE_SIZE); | 837 | PAGE_SIZE); |
| 828 | if (bootmap == -1L) | 838 | if (bootmap == -1L) |
| 829 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | 839 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); |
| 830 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); | 840 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); |
| 831 | 841 | ||
| 832 | /* don't touch min_low_pfn */ | ||
| 833 | bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, | ||
| 834 | min_low_pfn, max_low_pfn); | ||
| 835 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", | 842 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", |
| 836 | max_pfn_mapped<<PAGE_SHIFT); | 843 | max_pfn_mapped<<PAGE_SHIFT); |
| 837 | printk(KERN_INFO " low ram: %08lx - %08lx\n", | 844 | printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); |
| 838 | min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); | ||
| 839 | printk(KERN_INFO " bootmap %08lx - %08lx\n", | ||
| 840 | bootmap, bootmap + bootmap_size); | ||
| 841 | for_each_online_node(i) | ||
| 842 | free_bootmem_with_active_regions(i, max_low_pfn); | ||
| 843 | early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); | ||
| 844 | |||
| 845 | after_init_bootmem = 1; | ||
| 846 | } | ||
| 847 | |||
| 848 | static void __init find_early_table_space(unsigned long end, int use_pse) | ||
| 849 | { | ||
| 850 | unsigned long puds, pmds, ptes, tables, start; | ||
| 851 | |||
| 852 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
| 853 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | ||
| 854 | |||
| 855 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 856 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
| 857 | |||
| 858 | if (use_pse) { | ||
| 859 | unsigned long extra; | ||
| 860 | |||
| 861 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | ||
| 862 | extra += PMD_SIZE; | ||
| 863 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 864 | } else | ||
| 865 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 866 | 845 | ||
| 867 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); | 846 | #ifdef CONFIG_NEED_MULTIPLE_NODES |
| 868 | 847 | for_each_online_node(nodeid) | |
| 869 | /* for fixmap */ | 848 | bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid], |
| 870 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); | 849 | node_end_pfn[nodeid], bootmap); |
| 871 | |||
| 872 | /* | ||
| 873 | * RED-PEN putting page tables only on node 0 could | ||
| 874 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
| 875 | * need roughly 0.5KB per GB. | ||
| 876 | */ | ||
| 877 | start = 0x7000; | ||
| 878 | table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, | ||
| 879 | tables, PAGE_SIZE); | ||
| 880 | if (table_start == -1UL) | ||
| 881 | panic("Cannot find space for the kernel page tables"); | ||
| 882 | |||
| 883 | table_start >>= PAGE_SHIFT; | ||
| 884 | table_end = table_start; | ||
| 885 | table_top = table_start + (tables>>PAGE_SHIFT); | ||
| 886 | |||
| 887 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
| 888 | end, table_start << PAGE_SHIFT, | ||
| 889 | (table_start << PAGE_SHIFT) + tables); | ||
| 890 | } | ||
| 891 | |||
| 892 | unsigned long __init_refok init_memory_mapping(unsigned long start, | ||
| 893 | unsigned long end) | ||
| 894 | { | ||
| 895 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 896 | unsigned long start_pfn, end_pfn; | ||
| 897 | unsigned long big_page_start; | ||
| 898 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 899 | /* | ||
| 900 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
| 901 | * This will simplify cpa(), which otherwise needs to support splitting | ||
| 902 | * large pages into small in interrupt context, etc. | ||
| 903 | */ | ||
| 904 | int use_pse = 0; | ||
| 905 | #else | 850 | #else |
| 906 | int use_pse = cpu_has_pse; | 851 | bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap); |
| 907 | #endif | ||
| 908 | |||
| 909 | /* | ||
| 910 | * Find space for the kernel direct mapping tables. | ||
| 911 | */ | ||
| 912 | if (!after_init_bootmem) | ||
| 913 | find_early_table_space(end, use_pse); | ||
| 914 | |||
| 915 | #ifdef CONFIG_X86_PAE | ||
| 916 | set_nx(); | ||
| 917 | if (nx_enabled) | ||
| 918 | printk(KERN_INFO "NX (Execute Disable) protection: active\n"); | ||
| 919 | #endif | 852 | #endif |
| 920 | 853 | ||
| 921 | /* Enable PSE if available */ | 854 | after_bootmem = 1; |
| 922 | if (cpu_has_pse) | ||
| 923 | set_in_cr4(X86_CR4_PSE); | ||
| 924 | |||
| 925 | /* Enable PGE if available */ | ||
| 926 | if (cpu_has_pge) { | ||
| 927 | set_in_cr4(X86_CR4_PGE); | ||
| 928 | __supported_pte_mask |= _PAGE_GLOBAL; | ||
| 929 | } | ||
| 930 | |||
| 931 | /* | ||
| 932 | * Don't use a large page for the first 2/4MB of memory | ||
| 933 | * because there are often fixed size MTRRs in there | ||
| 934 | * and overlapping MTRRs into large pages can cause | ||
| 935 | * slowdowns. | ||
| 936 | */ | ||
| 937 | big_page_start = PMD_SIZE; | ||
| 938 | |||
| 939 | if (start < big_page_start) { | ||
| 940 | start_pfn = start >> PAGE_SHIFT; | ||
| 941 | end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); | ||
| 942 | } else { | ||
| 943 | /* head is not big page alignment ? */ | ||
| 944 | start_pfn = start >> PAGE_SHIFT; | ||
| 945 | end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 946 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 947 | } | ||
| 948 | if (start_pfn < end_pfn) | ||
| 949 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); | ||
| 950 | |||
| 951 | /* big page range */ | ||
| 952 | start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 953 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 954 | if (start_pfn < (big_page_start >> PAGE_SHIFT)) | ||
| 955 | start_pfn = big_page_start >> PAGE_SHIFT; | ||
| 956 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 957 | if (start_pfn < end_pfn) | ||
| 958 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, | ||
| 959 | use_pse); | ||
| 960 | |||
| 961 | /* tail is not big page alignment ? */ | ||
| 962 | start_pfn = end_pfn; | ||
| 963 | if (start_pfn > (big_page_start>>PAGE_SHIFT)) { | ||
| 964 | end_pfn = end >> PAGE_SHIFT; | ||
| 965 | if (start_pfn < end_pfn) | ||
| 966 | kernel_physical_mapping_init(pgd_base, start_pfn, | ||
| 967 | end_pfn, 0); | ||
| 968 | } | ||
| 969 | |||
| 970 | early_ioremap_page_table_range_init(pgd_base); | ||
| 971 | |||
| 972 | load_cr3(swapper_pg_dir); | ||
| 973 | |||
| 974 | __flush_tlb_all(); | ||
| 975 | |||
| 976 | if (!after_init_bootmem) | ||
| 977 | reserve_early(table_start << PAGE_SHIFT, | ||
| 978 | table_end << PAGE_SHIFT, "PGTABLE"); | ||
| 979 | |||
| 980 | if (!after_init_bootmem) | ||
| 981 | early_memtest(start, end); | ||
| 982 | |||
| 983 | return end >> PAGE_SHIFT; | ||
| 984 | } | 855 | } |
| 985 | 856 | ||
| 986 | |||
| 987 | /* | 857 | /* |
| 988 | * paging_init() sets up the page tables - note that the first 8MB are | 858 | * paging_init() sets up the page tables - note that the first 8MB are |
| 989 | * already mapped by head.S. | 859 | * already mapped by head.S. |
| @@ -1217,13 +1087,6 @@ void mark_rodata_ro(void) | |||
| 1217 | } | 1087 | } |
| 1218 | #endif | 1088 | #endif |
| 1219 | 1089 | ||
| 1220 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 1221 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
| 1222 | { | ||
| 1223 | free_init_pages("initrd memory", start, end); | ||
| 1224 | } | ||
| 1225 | #endif | ||
| 1226 | |||
| 1227 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | 1090 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, |
| 1228 | int flags) | 1091 | int flags) |
| 1229 | { | 1092 | { |
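
The init_32.c side of this series converts kernel_physical_mapping_init() from the old (pgd_base, start_pfn, end_pfn, use_pse) interface to the 64-bit style (start, end, page_size_mask), with find_early_table_space() and init_memory_mapping() moving out of this file into shared code. A minimal user-space sketch of the parameter derivation the new entry point performs, using only logic visible in the hunks above (the PAGE_SHIFT and PG_LEVEL_2M values are illustrative stand-ins, not taken from the shared headers):

    /* Sketch: how the 32-bit kernel_physical_mapping_init() derives its
     * working values from the unified (start, end, page_size_mask) call. */
    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define PG_LEVEL_2M  2   /* assumed to match the kernel's enum pg_level */

    static void map_params(unsigned long start, unsigned long end,
                           unsigned long page_size_mask)
    {
        /* On 32-bit only 2M (PSE) pages exist, so the mask test is exact. */
        int use_pse = page_size_mask == (1UL << PG_LEVEL_2M);
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long end_pfn = end >> PAGE_SHIFT;

        printf("pfns %lx-%lx, use_pse=%d\n", start_pfn, end_pfn, use_pse);
    }

    int main(void)
    {
        map_params(0, 0x38000000UL, 1UL << PG_LEVEL_2M);
        return 0;
    }
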
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 07f44d491df1..8a853bc3b287 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
| @@ -48,6 +48,7 @@ | |||
| 48 | #include <asm/kdebug.h> | 48 | #include <asm/kdebug.h> |
| 49 | #include <asm/numa.h> | 49 | #include <asm/numa.h> |
| 50 | #include <asm/cacheflush.h> | 50 | #include <asm/cacheflush.h> |
| 51 | #include <asm/init.h> | ||
| 51 | 52 | ||
| 52 | /* | 53 | /* |
| 53 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. | 54 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. |
| @@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata; | |||
| 61 | 62 | ||
| 62 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 63 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
| 63 | 64 | ||
| 64 | int direct_gbpages | ||
| 65 | #ifdef CONFIG_DIRECT_GBPAGES | ||
| 66 | = 1 | ||
| 67 | #endif | ||
| 68 | ; | ||
| 69 | |||
| 70 | static int __init parse_direct_gbpages_off(char *arg) | 65 | static int __init parse_direct_gbpages_off(char *arg) |
| 71 | { | 66 | { |
| 72 | direct_gbpages = 0; | 67 | direct_gbpages = 0; |
| @@ -87,8 +82,6 @@ early_param("gbpages", parse_direct_gbpages_on); | |||
| 87 | * around without checking the pgd every time. | 82 | * around without checking the pgd every time. |
| 88 | */ | 83 | */ |
| 89 | 84 | ||
| 90 | int after_bootmem; | ||
| 91 | |||
| 92 | pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; | 85 | pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; |
| 93 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 86 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
| 94 | 87 | ||
| @@ -325,13 +318,9 @@ void __init cleanup_highmap(void) | |||
| 325 | } | 318 | } |
| 326 | } | 319 | } |
| 327 | 320 | ||
| 328 | static unsigned long __initdata table_start; | ||
| 329 | static unsigned long __meminitdata table_end; | ||
| 330 | static unsigned long __meminitdata table_top; | ||
| 331 | |||
| 332 | static __ref void *alloc_low_page(unsigned long *phys) | 321 | static __ref void *alloc_low_page(unsigned long *phys) |
| 333 | { | 322 | { |
| 334 | unsigned long pfn = table_end++; | 323 | unsigned long pfn = e820_table_end++; |
| 335 | void *adr; | 324 | void *adr; |
| 336 | 325 | ||
| 337 | if (after_bootmem) { | 326 | if (after_bootmem) { |
| @@ -341,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
| 341 | return adr; | 330 | return adr; |
| 342 | } | 331 | } |
| 343 | 332 | ||
| 344 | if (pfn >= table_top) | 333 | if (pfn >= e820_table_top) |
| 345 | panic("alloc_low_page: ran out of memory"); | 334 | panic("alloc_low_page: ran out of memory"); |
| 346 | 335 | ||
| 347 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); | 336 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); |
| @@ -581,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
| 581 | return phys_pud_init(pud, addr, end, page_size_mask); | 570 | return phys_pud_init(pud, addr, end, page_size_mask); |
| 582 | } | 571 | } |
| 583 | 572 | ||
| 584 | static void __init find_early_table_space(unsigned long end, int use_pse, | 573 | unsigned long __init |
| 585 | int use_gbpages) | 574 | kernel_physical_mapping_init(unsigned long start, |
| 586 | { | 575 | unsigned long end, |
| 587 | unsigned long puds, pmds, ptes, tables, start; | 576 | unsigned long page_size_mask) |
| 588 | |||
| 589 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
| 590 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | ||
| 591 | if (use_gbpages) { | ||
| 592 | unsigned long extra; | ||
| 593 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); | ||
| 594 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 595 | } else | ||
| 596 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
| 597 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); | ||
| 598 | |||
| 599 | if (use_pse) { | ||
| 600 | unsigned long extra; | ||
| 601 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | ||
| 602 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 603 | } else | ||
| 604 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 605 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); | ||
| 606 | |||
| 607 | /* | ||
| 608 | * RED-PEN putting page tables only on node 0 could | ||
| 609 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
| 610 | * need roughly 0.5KB per GB. | ||
| 611 | */ | ||
| 612 | start = 0x8000; | ||
| 613 | table_start = find_e820_area(start, end, tables, PAGE_SIZE); | ||
| 614 | if (table_start == -1UL) | ||
| 615 | panic("Cannot find space for the kernel page tables"); | ||
| 616 | |||
| 617 | table_start >>= PAGE_SHIFT; | ||
| 618 | table_end = table_start; | ||
| 619 | table_top = table_start + (tables >> PAGE_SHIFT); | ||
| 620 | |||
| 621 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
| 622 | end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); | ||
| 623 | } | ||
| 624 | |||
| 625 | static void __init init_gbpages(void) | ||
| 626 | { | ||
| 627 | if (direct_gbpages && cpu_has_gbpages) | ||
| 628 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
| 629 | else | ||
| 630 | direct_gbpages = 0; | ||
| 631 | } | ||
| 632 | |||
| 633 | static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, | ||
| 634 | unsigned long end, | ||
| 635 | unsigned long page_size_mask) | ||
| 636 | { | 577 | { |
| 637 | 578 | ||
| 638 | unsigned long next, last_map_addr = end; | 579 | unsigned long next, last_map_addr = end; |
| @@ -669,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, | |||
| 669 | return last_map_addr; | 610 | return last_map_addr; |
| 670 | } | 611 | } |
| 671 | 612 | ||
| 672 | struct map_range { | ||
| 673 | unsigned long start; | ||
| 674 | unsigned long end; | ||
| 675 | unsigned page_size_mask; | ||
| 676 | }; | ||
| 677 | |||
| 678 | #define NR_RANGE_MR 5 | ||
| 679 | |||
| 680 | static int save_mr(struct map_range *mr, int nr_range, | ||
| 681 | unsigned long start_pfn, unsigned long end_pfn, | ||
| 682 | unsigned long page_size_mask) | ||
| 683 | { | ||
| 684 | |||
| 685 | if (start_pfn < end_pfn) { | ||
| 686 | if (nr_range >= NR_RANGE_MR) | ||
| 687 | panic("run out of range for init_memory_mapping\n"); | ||
| 688 | mr[nr_range].start = start_pfn<<PAGE_SHIFT; | ||
| 689 | mr[nr_range].end = end_pfn<<PAGE_SHIFT; | ||
| 690 | mr[nr_range].page_size_mask = page_size_mask; | ||
| 691 | nr_range++; | ||
| 692 | } | ||
| 693 | |||
| 694 | return nr_range; | ||
| 695 | } | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Setup the direct mapping of the physical memory at PAGE_OFFSET. | ||
| 699 | * This runs before bootmem is initialized and gets pages directly from | ||
| 700 | * the physical memory. To access them they are temporarily mapped. | ||
| 701 | */ | ||
| 702 | unsigned long __init_refok init_memory_mapping(unsigned long start, | ||
| 703 | unsigned long end) | ||
| 704 | { | ||
| 705 | unsigned long last_map_addr = 0; | ||
| 706 | unsigned long page_size_mask = 0; | ||
| 707 | unsigned long start_pfn, end_pfn; | ||
| 708 | unsigned long pos; | ||
| 709 | |||
| 710 | struct map_range mr[NR_RANGE_MR]; | ||
| 711 | int nr_range, i; | ||
| 712 | int use_pse, use_gbpages; | ||
| 713 | |||
| 714 | printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); | ||
| 715 | |||
| 716 | /* | ||
| 717 | * Find space for the kernel direct mapping tables. | ||
| 718 | * | ||
| 719 | * Later we should allocate these tables in the local node of the | ||
| 720 | * memory mapped. Unfortunately this is done currently before the | ||
| 721 | * nodes are discovered. | ||
| 722 | */ | ||
| 723 | if (!after_bootmem) | ||
| 724 | init_gbpages(); | ||
| 725 | |||
| 726 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 727 | /* | ||
| 728 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
| 729 | * This will simplify cpa(), which otherwise needs to support splitting | ||
| 730 | * large pages into small in interrupt context, etc. | ||
| 731 | */ | ||
| 732 | use_pse = use_gbpages = 0; | ||
| 733 | #else | ||
| 734 | use_pse = cpu_has_pse; | ||
| 735 | use_gbpages = direct_gbpages; | ||
| 736 | #endif | ||
| 737 | |||
| 738 | if (use_gbpages) | ||
| 739 | page_size_mask |= 1 << PG_LEVEL_1G; | ||
| 740 | if (use_pse) | ||
| 741 | page_size_mask |= 1 << PG_LEVEL_2M; | ||
| 742 | |||
| 743 | memset(mr, 0, sizeof(mr)); | ||
| 744 | nr_range = 0; | ||
| 745 | |||
| 746 | /* head if not big page alignment ?*/ | ||
| 747 | start_pfn = start >> PAGE_SHIFT; | ||
| 748 | pos = start_pfn << PAGE_SHIFT; | ||
| 749 | end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) | ||
| 750 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 751 | if (end_pfn > (end >> PAGE_SHIFT)) | ||
| 752 | end_pfn = end >> PAGE_SHIFT; | ||
| 753 | if (start_pfn < end_pfn) { | ||
| 754 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 755 | pos = end_pfn << PAGE_SHIFT; | ||
| 756 | } | ||
| 757 | |||
| 758 | /* big page (2M) range*/ | ||
| 759 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 760 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 761 | end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 762 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 763 | if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) | ||
| 764 | end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); | ||
| 765 | if (start_pfn < end_pfn) { | ||
| 766 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 767 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 768 | pos = end_pfn << PAGE_SHIFT; | ||
| 769 | } | ||
| 770 | |||
| 771 | /* big page (1G) range */ | ||
| 772 | start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
| 773 | << (PUD_SHIFT - PAGE_SHIFT); | ||
| 774 | end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); | ||
| 775 | if (start_pfn < end_pfn) { | ||
| 776 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 777 | page_size_mask & | ||
| 778 | ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); | ||
| 779 | pos = end_pfn << PAGE_SHIFT; | ||
| 780 | } | ||
| 781 | |||
| 782 | /* tail is not big page (1G) alignment */ | ||
| 783 | start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
| 784 | << (PMD_SHIFT - PAGE_SHIFT); | ||
| 785 | end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
| 786 | if (start_pfn < end_pfn) { | ||
| 787 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
| 788 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
| 789 | pos = end_pfn << PAGE_SHIFT; | ||
| 790 | } | ||
| 791 | |||
| 792 | /* tail is not big page (2M) alignment */ | ||
| 793 | start_pfn = pos>>PAGE_SHIFT; | ||
| 794 | end_pfn = end>>PAGE_SHIFT; | ||
| 795 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
| 796 | |||
| 797 | /* try to merge same page size and continuous */ | ||
| 798 | for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { | ||
| 799 | unsigned long old_start; | ||
| 800 | if (mr[i].end != mr[i+1].start || | ||
| 801 | mr[i].page_size_mask != mr[i+1].page_size_mask) | ||
| 802 | continue; | ||
| 803 | /* move it */ | ||
| 804 | old_start = mr[i].start; | ||
| 805 | memmove(&mr[i], &mr[i+1], | ||
| 806 | (nr_range - 1 - i) * sizeof (struct map_range)); | ||
| 807 | mr[i--].start = old_start; | ||
| 808 | nr_range--; | ||
| 809 | } | ||
| 810 | |||
| 811 | for (i = 0; i < nr_range; i++) | ||
| 812 | printk(KERN_DEBUG " %010lx - %010lx page %s\n", | ||
| 813 | mr[i].start, mr[i].end, | ||
| 814 | (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( | ||
| 815 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); | ||
| 816 | |||
| 817 | if (!after_bootmem) | ||
| 818 | find_early_table_space(end, use_pse, use_gbpages); | ||
| 819 | |||
| 820 | for (i = 0; i < nr_range; i++) | ||
| 821 | last_map_addr = kernel_physical_mapping_init( | ||
| 822 | mr[i].start, mr[i].end, | ||
| 823 | mr[i].page_size_mask); | ||
| 824 | |||
| 825 | if (!after_bootmem) | ||
| 826 | mmu_cr4_features = read_cr4(); | ||
| 827 | __flush_tlb_all(); | ||
| 828 | |||
| 829 | if (!after_bootmem && table_end > table_start) | ||
| 830 | reserve_early(table_start << PAGE_SHIFT, | ||
| 831 | table_end << PAGE_SHIFT, "PGTABLE"); | ||
| 832 | |||
| 833 | printk(KERN_INFO "last_map_addr: %lx end: %lx\n", | ||
| 834 | last_map_addr, end); | ||
| 835 | |||
| 836 | if (!after_bootmem) | ||
| 837 | early_memtest(start, end); | ||
| 838 | |||
| 839 | return last_map_addr >> PAGE_SHIFT; | ||
| 840 | } | ||
| 841 | |||
| 842 | #ifndef CONFIG_NUMA | 613 | #ifndef CONFIG_NUMA |
| 843 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) | 614 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) |
| 844 | { | 615 | { |
| @@ -910,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | |||
| 910 | 681 | ||
| 911 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 682 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
| 912 | 683 | ||
| 913 | /* | ||
| 914 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
| 915 | * is valid. The argument is a physical page number. | ||
| 916 | * | ||
| 917 | * | ||
| 918 | * On x86, access has to be given to the first megabyte of ram because that area | ||
| 919 | * contains bios code and data regions used by X and dosemu and similar apps. | ||
| 920 | * Access has to be given to non-kernel-ram areas as well, these contain the PCI | ||
| 921 | * mmio resources as well as potential bios/acpi data regions. | ||
| 922 | */ | ||
| 923 | int devmem_is_allowed(unsigned long pagenr) | ||
| 924 | { | ||
| 925 | if (pagenr <= 256) | ||
| 926 | return 1; | ||
| 927 | if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) | ||
| 928 | return 0; | ||
| 929 | if (!page_is_ram(pagenr)) | ||
| 930 | return 1; | ||
| 931 | return 0; | ||
| 932 | } | ||
| 933 | |||
| 934 | |||
| 935 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, | 684 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, |
| 936 | kcore_modules, kcore_vsyscall; | 685 | kcore_modules, kcore_vsyscall; |
| 937 | 686 | ||
| @@ -1019,13 +768,6 @@ void mark_rodata_ro(void) | |||
| 1019 | 768 | ||
| 1020 | #endif | 769 | #endif |
| 1021 | 770 | ||
| 1022 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 1023 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
| 1024 | { | ||
| 1025 | free_init_pages("initrd memory", start, end); | ||
| 1026 | } | ||
| 1027 | #endif | ||
| 1028 | |||
| 1029 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | 771 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, |
| 1030 | int flags) | 772 | int flags) |
| 1031 | { | 773 | { |
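
With init_memory_mapping() deleted from init_64.c, the map_range splitting and merging shown in the removed hunk presumably lives on in common x86 init code. A self-contained model of save_mr() and the adjacent-range merge loop, reconstructed from the removed lines only:

    #include <stdio.h>
    #include <string.h>

    #define NR_RANGE_MR 5

    struct map_range {
        unsigned long start, end;
        unsigned page_size_mask;
    };

    /* Collect a non-empty [start, end) range with its page-size mask. */
    static int save_mr(struct map_range *mr, int nr_range,
                       unsigned long start, unsigned long end,
                       unsigned long mask)
    {
        if (start >= end)
            return nr_range;
        mr[nr_range].start = start;
        mr[nr_range].end = end;
        mr[nr_range].page_size_mask = mask;
        return nr_range + 1;
    }

    /* Merge adjacent ranges that use the same page size. */
    static int merge_ranges(struct map_range *mr, int nr_range)
    {
        for (int i = 0; nr_range > 1 && i < nr_range - 1; i++) {
            if (mr[i].end != mr[i + 1].start ||
                mr[i].page_size_mask != mr[i + 1].page_size_mask)
                continue;
            /* fold mr[i+1] into mr[i], then re-check this slot */
            unsigned long old_start = mr[i].start;
            memmove(&mr[i], &mr[i + 1],
                    (nr_range - 1 - i) * sizeof(struct map_range));
            mr[i--].start = old_start;
            nr_range--;
        }
        return nr_range;
    }

    int main(void)
    {
        struct map_range mr[NR_RANGE_MR];
        int n = 0;

        n = save_mr(mr, n, 0x0,      0x200000, 0);   /* 4k head */
        n = save_mr(mr, n, 0x200000, 0x400000, 0);   /* adjacent, same mask */
        n = merge_ranges(mr, n);
        printf("%d range(s), first ends at %lx\n", n, mr[0].end);
        return 0;
    }

Running it merges the two adjacent 4k ranges into one, which is the invariant the removed loop maintained before handing the ranges to kernel_physical_mapping_init().
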
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 433f7bd4648a..62773abdf088 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
| @@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x) | |||
| 38 | } else { | 38 | } else { |
| 39 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | 39 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); |
| 40 | x -= PAGE_OFFSET; | 40 | x -= PAGE_OFFSET; |
| 41 | VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : | 41 | VIRTUAL_BUG_ON(!phys_addr_valid(x)); |
| 42 | !phys_addr_valid(x)); | ||
| 43 | } | 42 | } |
| 44 | return x; | 43 | return x; |
| 45 | } | 44 | } |
| @@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x) | |||
| 56 | if (x < PAGE_OFFSET) | 55 | if (x < PAGE_OFFSET) |
| 57 | return false; | 56 | return false; |
| 58 | x -= PAGE_OFFSET; | 57 | x -= PAGE_OFFSET; |
| 59 | if (system_state == SYSTEM_BOOTING ? | 58 | if (!phys_addr_valid(x)) |
| 60 | x > MAXMEM : !phys_addr_valid(x)) { | ||
| 61 | return false; | 59 | return false; |
| 62 | } | ||
| 63 | } | 60 | } |
| 64 | 61 | ||
| 65 | return pfn_valid(x >> PAGE_SHIFT); | 62 | return pfn_valid(x >> PAGE_SHIFT); |
| @@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr) | |||
| 76 | #ifdef CONFIG_DEBUG_VIRTUAL | 73 | #ifdef CONFIG_DEBUG_VIRTUAL |
| 77 | unsigned long __phys_addr(unsigned long x) | 74 | unsigned long __phys_addr(unsigned long x) |
| 78 | { | 75 | { |
| 79 | /* VMALLOC_* aren't constants; not available at the boot time */ | 76 | /* VMALLOC_* aren't constants */ |
| 80 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | 77 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); |
| 81 | VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && | 78 | VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); |
| 82 | is_vmalloc_addr((void *) x)); | ||
| 83 | return x - PAGE_OFFSET; | 79 | return x - PAGE_OFFSET; |
| 84 | } | 80 | } |
| 85 | EXPORT_SYMBOL(__phys_addr); | 81 | EXPORT_SYMBOL(__phys_addr); |
| @@ -89,7 +85,7 @@ bool __virt_addr_valid(unsigned long x) | |||
| 89 | { | 85 | { |
| 90 | if (x < PAGE_OFFSET) | 86 | if (x < PAGE_OFFSET) |
| 91 | return false; | 87 | return false; |
| 92 | if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) | 88 | if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) |
| 93 | return false; | 89 | return false; |
| 94 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); | 90 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); |
| 95 | } | 91 | } |
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 93d82038af4b..9f205030d9aa 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c | |||
| @@ -32,11 +32,14 @@ struct kmmio_fault_page { | |||
| 32 | struct list_head list; | 32 | struct list_head list; |
| 33 | struct kmmio_fault_page *release_next; | 33 | struct kmmio_fault_page *release_next; |
| 34 | unsigned long page; /* location of the fault page */ | 34 | unsigned long page; /* location of the fault page */ |
| 35 | bool old_presence; /* page presence prior to arming */ | ||
| 36 | bool armed; | ||
| 35 | 37 | ||
| 36 | /* | 38 | /* |
| 37 | * Number of times this page has been registered as a part | 39 | * Number of times this page has been registered as a part |
| 38 | * of a probe. If zero, page is disarmed and this may be freed. | 40 | * of a probe. If zero, page is disarmed and this may be freed. |
| 39 | * Used only by writers (RCU). | 41 | * Used only by writers (RCU) and post_kmmio_handler(). |
| 42 | * Protected by kmmio_lock, when linked into kmmio_page_table. | ||
| 40 | */ | 43 | */ |
| 41 | int count; | 44 | int count; |
| 42 | }; | 45 | }; |
| @@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) | |||
| 105 | return NULL; | 108 | return NULL; |
| 106 | } | 109 | } |
| 107 | 110 | ||
| 108 | static void set_page_present(unsigned long addr, bool present, | 111 | static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) |
| 109 | unsigned int *pglevel) | 112 | { |
| 113 | pmdval_t v = pmd_val(*pmd); | ||
| 114 | *old = !!(v & _PAGE_PRESENT); | ||
| 115 | v &= ~_PAGE_PRESENT; | ||
| 116 | if (present) | ||
| 117 | v |= _PAGE_PRESENT; | ||
| 118 | set_pmd(pmd, __pmd(v)); | ||
| 119 | } | ||
| 120 | |||
| 121 | static void set_pte_presence(pte_t *pte, bool present, bool *old) | ||
| 122 | { | ||
| 123 | pteval_t v = pte_val(*pte); | ||
| 124 | *old = !!(v & _PAGE_PRESENT); | ||
| 125 | v &= ~_PAGE_PRESENT; | ||
| 126 | if (present) | ||
| 127 | v |= _PAGE_PRESENT; | ||
| 128 | set_pte_atomic(pte, __pte(v)); | ||
| 129 | } | ||
| 130 | |||
| 131 | static int set_page_presence(unsigned long addr, bool present, bool *old) | ||
| 110 | { | 132 | { |
| 111 | pteval_t pteval; | ||
| 112 | pmdval_t pmdval; | ||
| 113 | unsigned int level; | 133 | unsigned int level; |
| 114 | pmd_t *pmd; | ||
| 115 | pte_t *pte = lookup_address(addr, &level); | 134 | pte_t *pte = lookup_address(addr, &level); |
| 116 | 135 | ||
| 117 | if (!pte) { | 136 | if (!pte) { |
| 118 | pr_err("kmmio: no pte for page 0x%08lx\n", addr); | 137 | pr_err("kmmio: no pte for page 0x%08lx\n", addr); |
| 119 | return; | 138 | return -1; |
| 120 | } | 139 | } |
| 121 | 140 | ||
| 122 | if (pglevel) | ||
| 123 | *pglevel = level; | ||
| 124 | |||
| 125 | switch (level) { | 141 | switch (level) { |
| 126 | case PG_LEVEL_2M: | 142 | case PG_LEVEL_2M: |
| 127 | pmd = (pmd_t *)pte; | 143 | set_pmd_presence((pmd_t *)pte, present, old); |
| 128 | pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; | ||
| 129 | if (present) | ||
| 130 | pmdval |= _PAGE_PRESENT; | ||
| 131 | set_pmd(pmd, __pmd(pmdval)); | ||
| 132 | break; | 144 | break; |
| 133 | |||
| 134 | case PG_LEVEL_4K: | 145 | case PG_LEVEL_4K: |
| 135 | pteval = pte_val(*pte) & ~_PAGE_PRESENT; | 146 | set_pte_presence(pte, present, old); |
| 136 | if (present) | ||
| 137 | pteval |= _PAGE_PRESENT; | ||
| 138 | set_pte_atomic(pte, __pte(pteval)); | ||
| 139 | break; | 147 | break; |
| 140 | |||
| 141 | default: | 148 | default: |
| 142 | pr_err("kmmio: unexpected page level 0x%x.\n", level); | 149 | pr_err("kmmio: unexpected page level 0x%x.\n", level); |
| 143 | return; | 150 | return -1; |
| 144 | } | 151 | } |
| 145 | 152 | ||
| 146 | __flush_tlb_one(addr); | 153 | __flush_tlb_one(addr); |
| 154 | return 0; | ||
| 147 | } | 155 | } |
| 148 | 156 | ||
| 149 | /** Mark the given page as not present. Access to it will trigger a fault. */ | 157 | /* |
| 150 | static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) | 158 | * Mark the given page as not present. Access to it will trigger a fault. |
| 159 | * | ||
| 160 | * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the | ||
| 161 | * protection is ignored here. RCU read lock is assumed held, so the struct | ||
| 162 | * will not disappear unexpectedly. Furthermore, the caller must guarantee, | ||
| 163 | * that double arming the same virtual address (page) cannot occur. | ||
| 164 | * | ||
| 165 | * Double disarming on the other hand is allowed, and may occur when a fault | ||
| 166 | * and mmiotrace shutdown happen simultaneously. | ||
| 167 | */ | ||
| 168 | static int arm_kmmio_fault_page(struct kmmio_fault_page *f) | ||
| 151 | { | 169 | { |
| 152 | set_page_present(page & PAGE_MASK, false, pglevel); | 170 | int ret; |
| 171 | WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); | ||
| 172 | if (f->armed) { | ||
| 173 | pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", | ||
| 174 | f->page, f->count, f->old_presence); | ||
| 175 | } | ||
| 176 | ret = set_page_presence(f->page, false, &f->old_presence); | ||
| 177 | WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); | ||
| 178 | f->armed = true; | ||
| 179 | return ret; | ||
| 153 | } | 180 | } |
| 154 | 181 | ||
| 155 | /** Mark the given page as present. */ | 182 | /** Restore the given page to saved presence state. */ |
| 156 | static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) | 183 | static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) |
| 157 | { | 184 | { |
| 158 | set_page_present(page & PAGE_MASK, true, pglevel); | 185 | bool tmp; |
| 186 | int ret = set_page_presence(f->page, f->old_presence, &tmp); | ||
| 187 | WARN_ONCE(ret < 0, | ||
| 188 | KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); | ||
| 189 | f->armed = false; | ||
| 159 | } | 190 | } |
| 160 | 191 | ||
| 161 | /* | 192 | /* |
| @@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) | |||
| 202 | 233 | ||
| 203 | ctx = &get_cpu_var(kmmio_ctx); | 234 | ctx = &get_cpu_var(kmmio_ctx); |
| 204 | if (ctx->active) { | 235 | if (ctx->active) { |
| 205 | disarm_kmmio_fault_page(faultpage->page, NULL); | ||
| 206 | if (addr == ctx->addr) { | 236 | if (addr == ctx->addr) { |
| 207 | /* | 237 | /* |
| 208 | * On SMP we sometimes get recursive probe hits on the | 238 | * A second fault on the same page means some other |
| 209 | * same address. Context is already saved, fall out. | 239 | * condition needs handling by do_page_fault(), the |
| 240 | * page really not being present is the most common. | ||
| 210 | */ | 241 | */ |
| 211 | pr_debug("kmmio: duplicate probe hit on CPU %d, for " | 242 | pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", |
| 212 | "address 0x%08lx.\n", | 243 | addr, smp_processor_id()); |
| 213 | smp_processor_id(), addr); | 244 | |
| 214 | ret = 1; | 245 | if (!faultpage->old_presence) |
| 215 | goto no_kmmio_ctx; | 246 | pr_info("kmmio: unexpected secondary hit for " |
| 216 | } | 247 | "address 0x%08lx on CPU %d.\n", addr, |
| 217 | /* | 248 | smp_processor_id()); |
| 218 | * Prevent overwriting already in-flight context. | 249 | } else { |
| 219 | * This should not happen, let's hope disarming at least | 250 | /* |
| 220 | * prevents a panic. | 251 | * Prevent overwriting already in-flight context. |
| 221 | */ | 252 | * This should not happen, let's hope disarming at |
| 222 | pr_emerg("kmmio: recursive probe hit on CPU %d, " | 253 | * least prevents a panic. |
| 254 | */ | ||
| 255 | pr_emerg("kmmio: recursive probe hit on CPU %d, " | ||
| 223 | "for address 0x%08lx. Ignoring.\n", | 256 | "for address 0x%08lx. Ignoring.\n", |
| 224 | smp_processor_id(), addr); | 257 | smp_processor_id(), addr); |
| 225 | pr_emerg("kmmio: previous hit was at 0x%08lx.\n", | 258 | pr_emerg("kmmio: previous hit was at 0x%08lx.\n", |
| 226 | ctx->addr); | 259 | ctx->addr); |
| 260 | disarm_kmmio_fault_page(faultpage); | ||
| 261 | } | ||
| 227 | goto no_kmmio_ctx; | 262 | goto no_kmmio_ctx; |
| 228 | } | 263 | } |
| 229 | ctx->active++; | 264 | ctx->active++; |
| @@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) | |||
| 244 | regs->flags &= ~X86_EFLAGS_IF; | 279 | regs->flags &= ~X86_EFLAGS_IF; |
| 245 | 280 | ||
| 246 | /* Now we set present bit in PTE and single step. */ | 281 | /* Now we set present bit in PTE and single step. */ |
| 247 | disarm_kmmio_fault_page(ctx->fpage->page, NULL); | 282 | disarm_kmmio_fault_page(ctx->fpage); |
| 248 | 283 | ||
| 249 | /* | 284 | /* |
| 250 | * If another cpu accesses the same page while we are stepping, | 285 | * If another cpu accesses the same page while we are stepping, |
| @@ -275,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) | |||
| 275 | struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); | 310 | struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); |
| 276 | 311 | ||
| 277 | if (!ctx->active) { | 312 | if (!ctx->active) { |
| 278 | pr_debug("kmmio: spurious debug trap on CPU %d.\n", | 313 | pr_warning("kmmio: spurious debug trap on CPU %d.\n", |
| 279 | smp_processor_id()); | 314 | smp_processor_id()); |
| 280 | goto out; | 315 | goto out; |
| 281 | } | 316 | } |
| @@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) | |||
| 283 | if (ctx->probe && ctx->probe->post_handler) | 318 | if (ctx->probe && ctx->probe->post_handler) |
| 284 | ctx->probe->post_handler(ctx->probe, condition, regs); | 319 | ctx->probe->post_handler(ctx->probe, condition, regs); |
| 285 | 320 | ||
| 286 | arm_kmmio_fault_page(ctx->fpage->page, NULL); | 321 | /* Prevent racing against release_kmmio_fault_page(). */ |
| 322 | spin_lock(&kmmio_lock); | ||
| 323 | if (ctx->fpage->count) | ||
| 324 | arm_kmmio_fault_page(ctx->fpage); | ||
| 325 | spin_unlock(&kmmio_lock); | ||
| 287 | 326 | ||
| 288 | regs->flags &= ~X86_EFLAGS_TF; | 327 | regs->flags &= ~X86_EFLAGS_TF; |
| 289 | regs->flags |= ctx->saved_flags; | 328 | regs->flags |= ctx->saved_flags; |
| @@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page) | |||
| 315 | f = get_kmmio_fault_page(page); | 354 | f = get_kmmio_fault_page(page); |
| 316 | if (f) { | 355 | if (f) { |
| 317 | if (!f->count) | 356 | if (!f->count) |
| 318 | arm_kmmio_fault_page(f->page, NULL); | 357 | arm_kmmio_fault_page(f); |
| 319 | f->count++; | 358 | f->count++; |
| 320 | return 0; | 359 | return 0; |
| 321 | } | 360 | } |
| 322 | 361 | ||
| 323 | f = kmalloc(sizeof(*f), GFP_ATOMIC); | 362 | f = kzalloc(sizeof(*f), GFP_ATOMIC); |
| 324 | if (!f) | 363 | if (!f) |
| 325 | return -1; | 364 | return -1; |
| 326 | 365 | ||
| 327 | f->count = 1; | 366 | f->count = 1; |
| 328 | f->page = page; | 367 | f->page = page; |
| 329 | list_add_rcu(&f->list, kmmio_page_list(f->page)); | ||
| 330 | 368 | ||
| 331 | arm_kmmio_fault_page(f->page, NULL); | 369 | if (arm_kmmio_fault_page(f)) { |
| 370 | kfree(f); | ||
| 371 | return -1; | ||
| 372 | } | ||
| 373 | |||
| 374 | list_add_rcu(&f->list, kmmio_page_list(f->page)); | ||
| 332 | 375 | ||
| 333 | return 0; | 376 | return 0; |
| 334 | } | 377 | } |
| @@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page, | |||
| 347 | f->count--; | 390 | f->count--; |
| 348 | BUG_ON(f->count < 0); | 391 | BUG_ON(f->count < 0); |
| 349 | if (!f->count) { | 392 | if (!f->count) { |
| 350 | disarm_kmmio_fault_page(f->page, NULL); | 393 | disarm_kmmio_fault_page(f); |
| 351 | f->release_next = *release_list; | 394 | f->release_next = *release_list; |
| 352 | *release_list = f; | 395 | *release_list = f; |
| 353 | } | 396 | } |
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 451fe95a0352..3daefa04ace5 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c | |||
| @@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn, | |||
| 416 | for_each_online_node(nid) | 416 | for_each_online_node(nid) |
| 417 | propagate_e820_map_node(nid); | 417 | propagate_e820_map_node(nid); |
| 418 | 418 | ||
| 419 | for_each_online_node(nid) | 419 | for_each_online_node(nid) { |
| 420 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | 420 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); |
| 421 | NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; | ||
| 422 | } | ||
| 421 | 423 | ||
| 422 | NODE_DATA(0)->bdata = &bootmem_node_data[0]; | ||
| 423 | setup_bootmem_allocator(); | 424 | setup_bootmem_allocator(); |
| 424 | } | 425 | } |
| 425 | 426 | ||
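
The numa_32.c fix points every online node's bdata at its own bootmem_node_data slot, where the old code initialized only node 0. A trivial stand-alone illustration (node count and structure layout are stand-ins):

    #include <stdio.h>
    #include <string.h>

    #define MAX_NUMNODES 4

    struct pglist_data { void *bdata; };

    static struct pglist_data node_data[MAX_NUMNODES];
    static void *bootmem_node_data[MAX_NUMNODES];

    int main(void)
    {
        int nid;

        for (nid = 0; nid < MAX_NUMNODES; nid++) {   /* for_each_online_node */
            memset(&node_data[nid], 0, sizeof(struct pglist_data));
            node_data[nid].bdata = &bootmem_node_data[nid];
        }
        printf("node 3 bdata set: %s\n", node_data[3].bdata ? "yes" : "no");
        return 0;
    }
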
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index ab50a8d7402c..427fd1b56df5 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Written by Pekka Paalanen, 2008 <pq@iki.fi> | 2 | * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> |
| 3 | */ | 3 | */ |
| 4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
| 5 | #include <linux/io.h> | 5 | #include <linux/io.h> |
| @@ -9,35 +9,74 @@ | |||
| 9 | 9 | ||
| 10 | static unsigned long mmio_address; | 10 | static unsigned long mmio_address; |
| 11 | module_param(mmio_address, ulong, 0); | 11 | module_param(mmio_address, ulong, 0); |
| 12 | MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); | 12 | MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " |
| 13 | "(or 8 MB if read_far is non-zero)."); | ||
| 14 | |||
| 15 | static unsigned long read_far = 0x400100; | ||
| 16 | module_param(read_far, ulong, 0); | ||
| 17 | MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " | ||
| 18 | "(default: 0x400100)."); | ||
| 19 | |||
| 20 | static unsigned v16(unsigned i) | ||
| 21 | { | ||
| 22 | return i * 12 + 7; | ||
| 23 | } | ||
| 24 | |||
| 25 | static unsigned v32(unsigned i) | ||
| 26 | { | ||
| 27 | return i * 212371 + 13; | ||
| 28 | } | ||
| 13 | 29 | ||
| 14 | static void do_write_test(void __iomem *p) | 30 | static void do_write_test(void __iomem *p) |
| 15 | { | 31 | { |
| 16 | unsigned int i; | 32 | unsigned int i; |
| 33 | pr_info(MODULE_NAME ": write test.\n"); | ||
| 17 | mmiotrace_printk("Write test.\n"); | 34 | mmiotrace_printk("Write test.\n"); |
| 35 | |||
| 18 | for (i = 0; i < 256; i++) | 36 | for (i = 0; i < 256; i++) |
| 19 | iowrite8(i, p + i); | 37 | iowrite8(i, p + i); |
| 38 | |||
| 20 | for (i = 1024; i < (5 * 1024); i += 2) | 39 | for (i = 1024; i < (5 * 1024); i += 2) |
| 21 | iowrite16(i * 12 + 7, p + i); | 40 | iowrite16(v16(i), p + i); |
| 41 | |||
| 22 | for (i = (5 * 1024); i < (16 * 1024); i += 4) | 42 | for (i = (5 * 1024); i < (16 * 1024); i += 4) |
| 23 | iowrite32(i * 212371 + 13, p + i); | 43 | iowrite32(v32(i), p + i); |
| 24 | } | 44 | } |
| 25 | 45 | ||
| 26 | static void do_read_test(void __iomem *p) | 46 | static void do_read_test(void __iomem *p) |
| 27 | { | 47 | { |
| 28 | unsigned int i; | 48 | unsigned int i; |
| 49 | unsigned errs[3] = { 0 }; | ||
| 50 | pr_info(MODULE_NAME ": read test.\n"); | ||
| 29 | mmiotrace_printk("Read test.\n"); | 51 | mmiotrace_printk("Read test.\n"); |
| 52 | |||
| 30 | for (i = 0; i < 256; i++) | 53 | for (i = 0; i < 256; i++) |
| 31 | ioread8(p + i); | 54 | if (ioread8(p + i) != i) |
| 55 | ++errs[0]; | ||
| 56 | |||
| 32 | for (i = 1024; i < (5 * 1024); i += 2) | 57 | for (i = 1024; i < (5 * 1024); i += 2) |
| 33 | ioread16(p + i); | 58 | if (ioread16(p + i) != v16(i)) |
| 59 | ++errs[1]; | ||
| 60 | |||
| 34 | for (i = (5 * 1024); i < (16 * 1024); i += 4) | 61 | for (i = (5 * 1024); i < (16 * 1024); i += 4) |
| 35 | ioread32(p + i); | 62 | if (ioread32(p + i) != v32(i)) |
| 63 | ++errs[2]; | ||
| 64 | |||
| 65 | mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", | ||
| 66 | errs[0], errs[1], errs[2]); | ||
| 36 | } | 67 | } |
| 37 | 68 | ||
| 38 | static void do_test(void) | 69 | static void do_read_far_test(void __iomem *p) |
| 39 | { | 70 | { |
| 40 | void __iomem *p = ioremap_nocache(mmio_address, 0x4000); | 71 | pr_info(MODULE_NAME ": read far test.\n"); |
| 72 | mmiotrace_printk("Read far test.\n"); | ||
| 73 | |||
| 74 | ioread32(p + read_far); | ||
| 75 | } | ||
| 76 | |||
| 77 | static void do_test(unsigned long size) | ||
| 78 | { | ||
| 79 | void __iomem *p = ioremap_nocache(mmio_address, size); | ||
| 41 | if (!p) { | 80 | if (!p) { |
| 42 | pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); | 81 | pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); |
| 43 | return; | 82 | return; |
| @@ -45,11 +84,15 @@ static void do_test(void) | |||
| 45 | mmiotrace_printk("ioremap returned %p.\n", p); | 84 | mmiotrace_printk("ioremap returned %p.\n", p); |
| 46 | do_write_test(p); | 85 | do_write_test(p); |
| 47 | do_read_test(p); | 86 | do_read_test(p); |
| 87 | if (read_far && read_far < size - 4) | ||
| 88 | do_read_far_test(p); | ||
| 48 | iounmap(p); | 89 | iounmap(p); |
| 49 | } | 90 | } |
| 50 | 91 | ||
| 51 | static int __init init(void) | 92 | static int __init init(void) |
| 52 | { | 93 | { |
| 94 | unsigned long size = (read_far) ? (8 << 20) : (16 << 10); | ||
| 95 | |||
| 53 | if (mmio_address == 0) { | 96 | if (mmio_address == 0) { |
| 54 | pr_err(MODULE_NAME ": you have to use the module argument " | 97 | pr_err(MODULE_NAME ": you have to use the module argument " |
| 55 | "mmio_address.\n"); | 98 | "mmio_address.\n"); |
| @@ -58,10 +101,11 @@ static int __init init(void) | |||
| 58 | return -ENXIO; | 101 | return -ENXIO; |
| 59 | } | 102 | } |
| 60 | 103 | ||
| 61 | pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " | 104 | pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " |
| 62 | "in PCI address space, and writing " | 105 | "address space, and writing 16 kB of rubbish in there.\n", |
| 63 | "rubbish in there.\n", mmio_address); | 106 | size >> 10, mmio_address); |
| 64 | do_test(); | 107 | do_test(size); |
| 108 | pr_info(MODULE_NAME ": All done.\n"); | ||
| 65 | return 0; | 109 | return 0; |
| 66 | } | 110 | } |
| 67 | 111 | ||
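
testmmiotrace now writes deterministic per-offset patterns via v16()/v32() and verifies them on read-back, counting mismatches per access width. The same pattern can be exercised in plain user-space memory (a local buffer replaces the ioremap'd window here):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    static unsigned v16(unsigned i) { return i * 12 + 7; }
    static unsigned v32(unsigned i) { return i * 212371 + 13; }

    int main(void)
    {
        static uint8_t buf[16 * 1024];
        unsigned i, errs[3] = { 0 };

        /* write phase: same offsets and widths as do_write_test() */
        for (i = 0; i < 256; i++)
            buf[i] = (uint8_t)i;
        for (i = 1024; i < 5 * 1024; i += 2) {
            uint16_t v = (uint16_t)v16(i);
            memcpy(buf + i, &v, 2);
        }
        for (i = 5 * 1024; i < 16 * 1024; i += 4) {
            uint32_t v = (uint32_t)v32(i);
            memcpy(buf + i, &v, 4);
        }

        /* read phase: count mismatches per width, as do_read_test() does */
        for (i = 0; i < 256; i++)
            if (buf[i] != (uint8_t)i)
                ++errs[0];
        for (i = 1024; i < 5 * 1024; i += 2) {
            uint16_t v;
            memcpy(&v, buf + i, 2);
            if (v != (uint16_t)v16(i))
                ++errs[1];
        }
        for (i = 5 * 1024; i < 16 * 1024; i += 4) {
            uint32_t v;
            memcpy(&v, buf + i, 4);
            if (v != (uint32_t)v32(i))
                ++errs[2];
        }

        printf("Read errors: 8-bit %u, 16-bit %u, 32-bit %u.\n",
               errs[0], errs[1], errs[2]);
        return 0;
    }

Against real MMIO the error counts tell you whether the device echoes writes; against this plain buffer they are always zero.
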
diff --git a/crypto/api.c b/crypto/api.c index efe77df6863f..38a2bc02a98c 100644 --- a/crypto/api.c +++ b/crypto/api.c | |||
| @@ -215,8 +215,19 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) | |||
| 215 | mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); | 215 | mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); |
| 216 | type &= mask; | 216 | type &= mask; |
| 217 | 217 | ||
| 218 | alg = try_then_request_module(crypto_alg_lookup(name, type, mask), | 218 | alg = crypto_alg_lookup(name, type, mask); |
| 219 | name); | 219 | if (!alg) { |
| 220 | char tmp[CRYPTO_MAX_ALG_NAME]; | ||
| 221 | |||
| 222 | request_module(name); | ||
| 223 | |||
| 224 | if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) && | ||
| 225 | snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp)) | ||
| 226 | request_module(tmp); | ||
| 227 | |||
| 228 | alg = crypto_alg_lookup(name, type, mask); | ||
| 229 | } | ||
| 230 | |||
| 220 | if (alg) | 231 | if (alg) |
| 221 | return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg; | 232 | return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg; |
| 222 | 233 | ||
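
The crypto/api.c change retries a failed algorithm lookup after requesting both the plain module name and a "<name>-all" alias; the padlock MODULE_ALIAS hunks below supply those aliases so that fallback-capable software implementations get pulled in alongside the hardware driver. Sketch of the alias construction (the printf stands in for the real request_module() call):

    #include <stdio.h>

    #define CRYPTO_MAX_ALG_NAME 64

    static void request_fallback(const char *name)
    {
        char tmp[CRYPTO_MAX_ALG_NAME];

        /* only request the alias if it fits, as the kernel code checks */
        if (snprintf(tmp, sizeof(tmp), "%s-all", name) < (int)sizeof(tmp))
            printf("request_module(\"%s\")\n", tmp);
    }

    int main(void)
    {
        request_fallback("aes");    /* -> request_module("aes-all") */
        return 0;
    }
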
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index 2d637e0fbc03..d9e751be8c5f 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c | |||
| @@ -457,10 +457,12 @@ static int init_ixp_crypto(void) | |||
| 457 | if (!ctx_pool) { | 457 | if (!ctx_pool) { |
| 458 | goto err; | 458 | goto err; |
| 459 | } | 459 | } |
| 460 | ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0); | 460 | ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0, |
| 461 | "ixp_crypto:out", NULL); | ||
| 461 | if (ret) | 462 | if (ret) |
| 462 | goto err; | 463 | goto err; |
| 463 | ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0); | 464 | ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0, |
| 465 | "ixp_crypto:in", NULL); | ||
| 464 | if (ret) { | 466 | if (ret) { |
| 465 | qmgr_release_queue(SEND_QID); | 467 | qmgr_release_queue(SEND_QID); |
| 466 | goto err; | 468 | goto err; |
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c index 856b3cc25583..3f0fdd18255d 100644 --- a/drivers/crypto/padlock-aes.c +++ b/drivers/crypto/padlock-aes.c | |||
| @@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support"); | |||
| 489 | MODULE_LICENSE("GPL"); | 489 | MODULE_LICENSE("GPL"); |
| 490 | MODULE_AUTHOR("Michal Ludvig"); | 490 | MODULE_AUTHOR("Michal Ludvig"); |
| 491 | 491 | ||
| 492 | MODULE_ALIAS("aes"); | 492 | MODULE_ALIAS("aes-all"); |
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index a7fbadebf623..a2c8e8514b63 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c | |||
| @@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support."); | |||
| 304 | MODULE_LICENSE("GPL"); | 304 | MODULE_LICENSE("GPL"); |
| 305 | MODULE_AUTHOR("Michal Ludvig"); | 305 | MODULE_AUTHOR("Michal Ludvig"); |
| 306 | 306 | ||
| 307 | MODULE_ALIAS("sha1"); | 307 | MODULE_ALIAS("sha1-all"); |
| 308 | MODULE_ALIAS("sha256"); | 308 | MODULE_ALIAS("sha256-all"); |
| 309 | MODULE_ALIAS("sha1-padlock"); | 309 | MODULE_ALIAS("sha1-padlock"); |
| 310 | MODULE_ALIAS("sha256-padlock"); | 310 | MODULE_ALIAS("sha256-padlock"); |
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index ea5440dd10dc..647374acba94 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c | |||
| @@ -1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma"); | |||
| 1401 | 1401 | ||
| 1402 | static struct platform_driver iop_adma_driver = { | 1402 | static struct platform_driver iop_adma_driver = { |
| 1403 | .probe = iop_adma_probe, | 1403 | .probe = iop_adma_probe, |
| 1404 | .remove = iop_adma_remove, | 1404 | .remove = __devexit_p(iop_adma_remove), |
| 1405 | .driver = { | 1405 | .driver = { |
| 1406 | .owner = THIS_MODULE, | 1406 | .owner = THIS_MODULE, |
| 1407 | .name = "iop-adma", | 1407 | .name = "iop-adma", |
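
This and the following .remove changes wrap __devexit functions in __devexit_p(). When hotplug support is compiled out, __devexit code is discarded at link time, so storing a bare function pointer would leave a dangling reference. A simplified model of the macro's intent (mirroring <linux/init.h>; CONFIG_HOTPLUG is defined here just to pick a branch):

    #include <stdio.h>

    #define CONFIG_HOTPLUG 1   /* flip off to see the NULL case */

    #ifdef CONFIG_HOTPLUG
    #define __devexit_p(x)  (x)      /* function is kept, use it */
    #else
    #define __devexit_p(x)  NULL     /* function discarded, store NULL */
    #endif

    static int fake_remove(void) { return 0; }

    int main(void)
    {
        int (*remove)(void) = __devexit_p(fake_remove);

        if (remove)
            printf(".remove set, returns %d\n", remove());
        else
            printf(".remove compiled out (no hotplug)\n");
        return 0;
    }
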
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index d35cbd1ff0b3..5d5d5b31867f 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c | |||
| @@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp, | |||
| 1287 | 1287 | ||
| 1288 | static struct platform_driver mv_xor_driver = { | 1288 | static struct platform_driver mv_xor_driver = { |
| 1289 | .probe = mv_xor_probe, | 1289 | .probe = mv_xor_probe, |
| 1290 | .remove = mv_xor_remove, | 1290 | .remove = __devexit_p(mv_xor_remove), |
| 1291 | .driver = { | 1291 | .driver = { |
| 1292 | .owner = THIS_MODULE, | 1292 | .owner = THIS_MODULE, |
| 1293 | .name = MV_XOR_NAME, | 1293 | .name = MV_XOR_NAME, |
diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c index 096e2a37446d..7c8b15b22bf2 100644 --- a/drivers/gpu/drm/drm_stub.c +++ b/drivers/gpu/drm/drm_stub.c | |||
| @@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data, | |||
| 168 | file_priv->minor->master != file_priv->master) { | 168 | file_priv->minor->master != file_priv->master) { |
| 169 | mutex_lock(&dev->struct_mutex); | 169 | mutex_lock(&dev->struct_mutex); |
| 170 | file_priv->minor->master = drm_master_get(file_priv->master); | 170 | file_priv->minor->master = drm_master_get(file_priv->master); |
| 171 | mutex_lock(&dev->struct_mutex); | 171 | mutex_unlock(&dev->struct_mutex); |
| 172 | } | 172 | } |
| 173 | 173 | ||
| 174 | return 0; | 174 | return 0; |
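
The drm_stub.c hunk fixes a copy-paste bug: struct_mutex was locked twice instead of being locked then unlocked, which self-deadlocks on a non-recursive mutex. A minimal pthread illustration of why the doubled lock can never return:

    #include <pthread.h>
    #include <stdio.h>

    int main(void)
    {
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

        pthread_mutex_lock(&m);
        /* pthread_mutex_lock(&m);   <- would block forever, like the
         *                              doubled mutex_lock() being fixed */
        pthread_mutex_unlock(&m);    /* the intended pairing */
        printf("balanced lock/unlock\n");
        return 0;
    }
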
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index eeda276f8f16..7f186bbcb99d 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c | |||
| @@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd, | |||
| 482 | return 0; | 482 | return 0; |
| 483 | } | 483 | } |
| 484 | 484 | ||
| 485 | static void __devexit | 485 | static void |
| 486 | mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data) | 486 | mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data) |
| 487 | { | 487 | { |
| 488 | if (drv_data->reg_base) { | 488 | if (drv_data->reg_base) { |
| @@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev) | |||
| 577 | 577 | ||
| 578 | static struct platform_driver mv64xxx_i2c_driver = { | 578 | static struct platform_driver mv64xxx_i2c_driver = { |
| 579 | .probe = mv64xxx_i2c_probe, | 579 | .probe = mv64xxx_i2c_probe, |
| 580 | .remove = mv64xxx_i2c_remove, | 580 | .remove = __devexit_p(mv64xxx_i2c_remove), |
| 581 | .driver = { | 581 | .driver = { |
| 582 | .owner = THIS_MODULE, | 582 | .owner = THIS_MODULE, |
| 583 | .name = MV64XXX_I2C_CTLR_NAME, | 583 | .name = MV64XXX_I2C_CTLR_NAME, |
diff --git a/drivers/mtd/nand/orion_nand.c b/drivers/mtd/nand/orion_nand.c index 917cf8d3ae95..c2dfd3ea353d 100644 --- a/drivers/mtd/nand/orion_nand.c +++ b/drivers/mtd/nand/orion_nand.c | |||
| @@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev) | |||
| 149 | 149 | ||
| 150 | static struct platform_driver orion_nand_driver = { | 150 | static struct platform_driver orion_nand_driver = { |
| 151 | .probe = orion_nand_probe, | 151 | .probe = orion_nand_probe, |
| 152 | .remove = orion_nand_remove, | 152 | .remove = __devexit_p(orion_nand_remove), |
| 153 | .driver = { | 153 | .driver = { |
| 154 | .name = "orion_nand", | 154 | .name = "orion_nand", |
| 155 | .owner = THIS_MODULE, | 155 | .owner = THIS_MODULE, |
diff --git a/drivers/net/arm/Makefile b/drivers/net/arm/Makefile index c69c0cdba4a2..811a3ccd14c1 100644 --- a/drivers/net/arm/Makefile +++ b/drivers/net/arm/Makefile | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | # | 4 | # |
| 5 | 5 | ||
| 6 | obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o | 6 | obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o |
| 7 | obj-$(CONFIG_ARM_ETHERH) += etherh.o ../8390.o | 7 | obj-$(CONFIG_ARM_ETHERH) += etherh.o |
| 8 | obj-$(CONFIG_ARM_ETHER3) += ether3.o | 8 | obj-$(CONFIG_ARM_ETHER3) += ether3.o |
| 9 | obj-$(CONFIG_ARM_ETHER1) += ether1.o | 9 | obj-$(CONFIG_ARM_ETHER1) += ether1.o |
| 10 | obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o | 10 | obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o |
diff --git a/drivers/net/arm/etherh.c b/drivers/net/arm/etherh.c index 54b52e5b1821..f52f668c49bf 100644 --- a/drivers/net/arm/etherh.c +++ b/drivers/net/arm/etherh.c | |||
| @@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = { | |||
| 641 | .ndo_open = etherh_open, | 641 | .ndo_open = etherh_open, |
| 642 | .ndo_stop = etherh_close, | 642 | .ndo_stop = etherh_close, |
| 643 | .ndo_set_config = etherh_set_config, | 643 | .ndo_set_config = etherh_set_config, |
| 644 | .ndo_start_xmit = ei_start_xmit, | 644 | .ndo_start_xmit = __ei_start_xmit, |
| 645 | .ndo_tx_timeout = ei_tx_timeout, | 645 | .ndo_tx_timeout = __ei_tx_timeout, |
| 646 | .ndo_get_stats = ei_get_stats, | 646 | .ndo_get_stats = __ei_get_stats, |
| 647 | .ndo_set_multicast_list = ei_set_multicast_list, | 647 | .ndo_set_multicast_list = __ei_set_multicast_list, |
| 648 | .ndo_validate_addr = eth_validate_addr, | 648 | .ndo_validate_addr = eth_validate_addr, |
| 649 | .ndo_set_mac_address = eth_mac_addr, | 649 | .ndo_set_mac_address = eth_mac_addr, |
| 650 | .ndo_change_mtu = eth_change_mtu, | 650 | .ndo_change_mtu = eth_change_mtu, |
| 651 | #ifdef CONFIG_NET_POLL_CONTROLLER | 651 | #ifdef CONFIG_NET_POLL_CONTROLLER |
| 652 | .ndo_poll_controller = ei_poll, | 652 | .ndo_poll_controller = __ei_poll, |
| 653 | #endif | 653 | #endif |
| 654 | }; | 654 | }; |
| 655 | 655 | ||
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 48ff701d3a72..2552b9f325ee 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev)
 
 static struct platform_driver pxafb_driver = {
 	.probe		= pxafb_probe,
-	.remove		= pxafb_remove,
+	.remove		= __devexit_p(pxafb_remove),
 	.suspend	= pxafb_suspend,
 	.resume		= pxafb_resume,
 	.driver		= {
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index f3f697df1d71..80044a4f3ab9 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void);
 #define rcu_enter_nohz()	do { } while (0)
 #define rcu_exit_nohz()		do { } while (0)
 
+/* A context switch is a grace period for rcuclassic. */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1;
+}
+
 #endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 921340a7b71c..528343e6da51 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,6 +52,9 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
+/* Internal to kernel, but needed by rcupreempt.h. */
+extern int rcu_scheduler_active;
+
 #if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
 #elif defined(CONFIG_TREE_RCU)
@@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void);
 
 /* Internal to kernel */
 extern void rcu_init(void);
+extern void rcu_scheduler_starting(void);
 extern int rcu_needs_cpu(int cpu);
 
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 3e05c09b54a2..74304b4538d8 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void)
 #define rcu_exit_nohz()		do { } while (0)
 #endif /* CONFIG_NO_HZ */
 
+/*
+ * A context switch is a grace period for rcupreempt synchronize_rcu()
+ * only during early boot, before the scheduler has been initialized.
+ * So, how the heck do we get a context switch?  Well, if the caller
+ * invokes synchronize_rcu(), they are willing to accept a context
+ * switch, so we simply pretend that one happened.
+ *
+ * After boot, there might be a blocked or preempted task in an RCU
+ * read-side critical section, so we cannot then take the fastpath.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1 && !rcu_scheduler_active;
+}
+
 #endif /* __LINUX_RCUPREEMPT_H */
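The extra !rcu_scheduler_active term is what sets the preemptible flavour apart: once
the scheduler is live, even a single online CPU can be hiding a reader that was
preempted mid-critical-section. A hypothetical reader (gp, p, and use() are
placeholder names, not kernel symbols):

    rcu_read_lock();
    p = rcu_dereference(gp);	/* may be preempted right here... */
    use(p);			/* ...so a UP synchronize_rcu() must still wait */
    rcu_read_unlock();

The classic and tree variants above and below need no such term, since their readers
never block or get preempted.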
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d4368b7975c3..a722fb67bb2d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void)
 }
 #endif /* CONFIG_NO_HZ */
 
+/* A context switch is a grace period for rcutree. */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1;
+}
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f0a50b20e8a0..a7c7698583bb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2303,9 +2303,13 @@ extern long sched_group_rt_runtime(struct task_group *tg);
 extern int sched_group_set_rt_period(struct task_group *tg,
 				      long rt_period_us);
 extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
 #endif
 
+extern int task_can_switch_user(struct user_struct *up,
+				struct task_struct *tsk);
+
 #ifdef CONFIG_TASK_XACCT
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
diff --git a/init/main.c b/init/main.c
index 6441083f8273..6bf83afd654d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -98,7 +98,7 @@ static inline void mark_rodata_ro(void) { }
 extern void tc_init(void);
 #endif
 
-enum system_states system_state;
+enum system_states system_state __read_mostly;
 EXPORT_SYMBOL(system_state);
 
 /*
@@ -464,6 +464,7 @@ static noinline void __init_refok rest_init(void)
 	 * at least once to get things moving:
 	 */
 	init_idle_bootup_task(current);
+	rcu_scheduler_starting();
 	preempt_enable_no_resched();
 	schedule();
 	preempt_disable();
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index bd5a9003497c..654c640a6b9c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu)
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+	    (idle_cpu(cpu) && rcu_scheduler_active &&
+	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 
 		/*
 		 * Get here if this CPU took its interrupt from user
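The new rcu_scheduler_active test (mirrored in kernel/rcutree.c below) stops RCU from
treating the booting kernel as an idle CPU: before the scheduler starts, boot runs in
the idle task's context, so without this check an interrupt taken during boot would
be counted as a quiescent state and a grace period could end while boot-time readers
are still in flight.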
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a881aa..cae8a059cf47 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
+#include <linux/kernel_stat.h>
 
 enum rcu_barrier {
 	RCU_BARRIER_STD,
@@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
+int rcu_scheduler_active __read_mostly;
 
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head)
 void synchronize_rcu(void)
 {
 	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu(&rcu.head, wakeme_after_rcu);
@@ -175,3 +181,9 @@ void __init rcu_init(void)
 	__rcu_init();
 }
 
+void rcu_scheduler_starting(void)
+{
+	WARN_ON(num_online_cpus() != 1);
+	WARN_ON(nr_context_switches() > 0);
+	rcu_scheduler_active = 1;
+}
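rcu_scheduler_starting() pairs with the rest_init() call site in the init/main.c hunk
above: it runs while the kernel is still single-threaded, with preemption disabled and
before the first schedule(), which is exactly what the two WARN_ON()s assert
(nr_context_switches() is why linux/kernel_stat.h is now included). From that point on,
rcu_blocking_is_gp() in rcupreempt.h stops taking the early-boot fastpath.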
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 33cfc50781f9..5d59e850fb71 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1181,6 +1181,9 @@ void __synchronize_sched(void)
 {
 	struct rcu_synchronize rcu;
 
+	if (num_online_cpus() == 1)
+		return;  /* blocking is gp if only one CPU! */
+
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu_sched(&rcu.head, wakeme_after_rcu);
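Note that __synchronize_sched() keeps the plain num_online_cpus() == 1 test, with no
rcu_scheduler_active term: sched read-side critical sections run with preemption
disabled, so on a single CPU the mere fact that the caller is running, and is allowed
to block, already implies no reader can be active.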
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b2fd602a6f6f..97ce31579ec0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+	    (idle_cpu(cpu) && rcu_scheduler_active &&
+	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 
 		/*
 		 * Get here if this CPU took its interrupt from user
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e5c38e1c8b5..0a76d0b6f215 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
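The old test had the disabled case inverted: it bailed out only when bandwidth was
enabled with unlimited runtime, so a disabled rt_bandwidth still armed the period
timer. The intent, restated (equivalent by De Morgan; a sketch, not the actual code):

    bool need_period_timer = rt_bandwidth_enabled() &&
    			     rt_b->rt_runtime != RUNTIME_INF;
    if (!need_period_timer)	/* == (!enabled || runtime == RUNTIME_INF) */
    	return;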
@@ -9219,6 +9219,16 @@ static int sched_rt_global_constraints(void)
 
 	return ret;
 }
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+	/* Don't accept realtime tasks when there is no way for them to run */
+	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+		return 0;
+
+	return 1;
+}
+
 #else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
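Factoring the check out as sched_rt_can_attach() lets two callers share it: the cgroup
attach path in the next hunk, and the new setuid path via task_can_switch_user() in
kernel/user.c at the end of this series.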
@@ -9312,8 +9322,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		      struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-	/* Don't accept realtime tasks when there is no way for them to run */
-	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
 		return -EINVAL;
 #else
 	/* We don't support RT-tasks being in separate groups */
diff --git a/kernel/sys.c b/kernel/sys.c
index f145c415bc16..37f458e6882a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -559,7 +559,7 @@ error:
 	abort_creds(new);
 	return retval;
 }
-
+
 /*
  * change the user struct in a credentials set to match the new UID
  */
@@ -571,6 +571,11 @@ static int set_user(struct cred *new)
 	if (!new_user)
 		return -EAGAIN;
 
+	if (!task_can_switch_user(new_user, current)) {
+		free_uid(new_user);
+		return -EINVAL;
+	}
+
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
 			new_user != INIT_USER) {
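The check sits deliberately before set_user() commits to anything: on failure the
fresh user_struct reference is dropped with free_uid() and -EINVAL is returned,
keeping it distinct from the -EAGAIN used for the RLIMIT_NPROC case just below.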
@@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 			goto error;
 	}
 
-	retval = -EAGAIN;
-	if (new->uid != old->uid && set_user(new) < 0)
-		goto error;
-
+	if (new->uid != old->uid) {
+		retval = set_user(new);
+		if (retval < 0)
+			goto error;
+	}
 	if (ruid != (uid_t) -1 ||
 	    (euid != (uid_t) -1 && euid != old->uid))
 		new->suid = new->euid;
@@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 	retval = -EPERM;
 	if (capable(CAP_SETUID)) {
 		new->suid = new->uid = uid;
-		if (uid != old->uid && set_user(new) < 0) {
-			retval = -EAGAIN;
-			goto error;
+		if (uid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
 		}
 	} else if (uid != old->uid && uid != new->suid) {
 		goto error;
@@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 		goto error;
 	}
 
-	retval = -EAGAIN;
 	if (ruid != (uid_t) -1) {
 		new->uid = ruid;
-		if (ruid != old->uid && set_user(new) < 0)
-			goto error;
+		if (ruid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
+		}
 	}
 	if (euid != (uid_t) -1)
 		new->euid = euid;
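All three syscalls (setreuid, setuid, setresuid) get the same treatment: instead of
hard-coding retval = -EAGAIN they now propagate whatever set_user() returns, so the
new -EINVAL from task_can_switch_user() actually reaches userspace. Seen from
userspace (a hypothetical sketch; drop_to() and uid are placeholders):

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    static void drop_to(uid_t uid)
    {
    	if (setuid(uid) == -1 && errno == EINVAL)
    		fprintf(stderr, "target user's RT group has no runtime\n");
    }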
diff --git a/kernel/user.c b/kernel/user.c
index 3551ac742395..6a9b696128c8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags)
 
 #endif
 
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
+/*
+ * We need to check if a setuid can take place. This function should be called
+ * before successfully completing the setuid.
+ */
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+
+	return sched_rt_can_attach(up->tg, tsk);
+
+}
+#else
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+	return 1;
+}
+#endif
+
 /*
  * Locate the user_struct for the passed UID. If found, take a ref on it. The
  * caller must undo that ref with free_uid().
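task_can_switch_user() only has teeth under CONFIG_RT_GROUP_SCHED plus
CONFIG_USER_SCHED, where each user_struct carries its own task group (up->tg) with an
RT bandwidth budget; in every other configuration it degenerates to "always allowed",
so plain setuid behaviour is unchanged.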
