-rw-r--r--  Documentation/x86/earlyprintk.txt          | 101
-rw-r--r--  Makefile                                    |   2
-rw-r--r--  arch/arm/kernel/setup.c                     |  13
-rw-r--r--  arch/arm/mach-at91/pm.c                     |   1
-rw-r--r--  arch/arm/mm/abort-ev6.S                     |   3
-rw-r--r--  arch/arm/plat-s3c64xx/irq-eint.c            |   2
-rw-r--r--  arch/powerpc/platforms/86xx/gef_sbc610.c    |   4
-rw-r--r--  arch/s390/crypto/aes_s390.c                 |   2
-rw-r--r--  arch/x86/Kconfig                            |   5
-rw-r--r--  arch/x86/include/asm/apicdef.h              |   1
-rw-r--r--  arch/x86/include/asm/efi.h                  |   2
-rw-r--r--  arch/x86/include/asm/fixmap.h               |  10
-rw-r--r--  arch/x86/include/asm/i387.h                 |   8
-rw-r--r--  arch/x86/include/asm/init.h                 |  18
-rw-r--r--  arch/x86/include/asm/mce.h                  |  35
-rw-r--r--  arch/x86/include/asm/msr-index.h            |   5
-rw-r--r--  arch/x86/include/asm/page_types.h           |   6
-rw-r--r--  arch/x86/include/asm/pat.h                  |   5
-rw-r--r--  arch/x86/include/asm/pgtable_32_types.h     |   5
-rw-r--r--  arch/x86/include/asm/pgtable_types.h        |   1
-rw-r--r--  arch/x86/kernel/alternative.c               |  17
-rw-r--r--  arch/x86/kernel/apic/apic.c                 |  15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile         |   1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c         |  14
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c         | 530
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c     |  22
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c   | 207
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c      |  29
-rw-r--r--  arch/x86/kernel/efi.c                       |   7
-rw-r--r--  arch/x86/kernel/efi_64.c                    |  21
-rw-r--r--  arch/x86/kernel/i387.c                      |   2
-rw-r--r--  arch/x86/kernel/mpparse.c                   |  25
-rw-r--r--  arch/x86/kernel/reboot.c                    |   8
-rw-r--r--  arch/x86/kernel/setup.c                     |   9
-rw-r--r--  arch/x86/math-emu/fpu_aux.c                 |  31
-rw-r--r--  arch/x86/mm/highmem_32.c                    |   9
-rw-r--r--  arch/x86/mm/init.c                          | 344
-rw-r--r--  arch/x86/mm/init_32.c                       | 255
-rw-r--r--  arch/x86/mm/init_64.c                       | 272
-rw-r--r--  arch/x86/mm/ioremap.c                       |  14
-rw-r--r--  arch/x86/mm/kmmio.c                         | 149
-rw-r--r--  arch/x86/mm/numa_32.c                       |   5
-rw-r--r--  arch/x86/mm/testmmiotrace.c                 |  70
-rw-r--r--  crypto/api.c                                |  15
-rw-r--r--  drivers/crypto/ixp4xx_crypto.c              |   6
-rw-r--r--  drivers/crypto/padlock-aes.c                |   2
-rw-r--r--  drivers/crypto/padlock-sha.c                |   4
-rw-r--r--  drivers/dma/iop-adma.c                      |   2
-rw-r--r--  drivers/dma/mv_xor.c                        |   2
-rw-r--r--  drivers/gpu/drm/drm_stub.c                  |   2
-rw-r--r--  drivers/i2c/busses/i2c-mv64xxx.c            |   4
-rw-r--r--  drivers/mtd/nand/orion_nand.c               |   2
-rw-r--r--  drivers/net/arm/Makefile                    |   2
-rw-r--r--  drivers/net/arm/etherh.c                    |  10
-rw-r--r--  drivers/video/pxafb.c                       |   2
-rw-r--r--  include/linux/rcuclassic.h                  |   6
-rw-r--r--  include/linux/rcupdate.h                    |   4
-rw-r--r--  include/linux/rcupreempt.h                  |  15
-rw-r--r--  include/linux/rcutree.h                     |   6
-rw-r--r--  include/linux/sched.h                       |   4
-rw-r--r--  init/main.c                                 |   3
-rw-r--r--  kernel/rcuclassic.c                         |   4
-rw-r--r--  kernel/rcupdate.c                           |  12
-rw-r--r--  kernel/rcupreempt.c                         |   3
-rw-r--r--  kernel/rcutree.c                            |   4
-rw-r--r--  kernel/sched.c                              |  15
-rw-r--r--  kernel/sys.c                                |  31
-rw-r--r--  kernel/user.c                               |  18
68 files changed, 1638 insertions, 825 deletions
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 000000000000..607b1a016064
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
1
2Mini-HOWTO for using the earlyprintk=dbgp boot option with a
3USB2 Debug port key and a debug cable, on x86 systems.
4
5You need two computers, the 'USB debug key' special gadget
6and two USB cables, connected like this:
7
8 [host/target] <-------> [USB debug key] <-------> [client/console]
9
101. There are three specific hardware requirements:
11
12 a.) Host/target system needs to have USB debug port capability.
13
14 You can check this capability by looking at a 'Debug port' bit in
15 the lspci -vvv output:
16
17 # lspci -vvv
18 ...
19 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
20 Subsystem: Lenovo ThinkPad T61
21 Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
22 Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
23 Latency: 0
24 Interrupt: pin D routed to IRQ 19
25 Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
26 Capabilities: [50] Power Management version 2
27 Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
28 Status: D0 PME-Enable- DSel=0 DScale=0 PME+
29 Capabilities: [58] Debug port: BAR=1 offset=00a0
30 ^^^^^^^^^^^ <==================== [ HERE ]
31 Kernel driver in use: ehci_hcd
32 Kernel modules: ehci-hcd
33 ...
34
35( If your system does not list a debug port capability then you probably
 36 won't be able to use the USB debug key. )
37
38 b.) You also need a Netchip USB debug cable/key:
39
40 http://www.plxtech.com/products/NET2000/NET20DC/default.asp
41
42 This is a small blue plastic connector with two USB connections,
43 it draws power from its USB connections.
44
45 c.) Thirdly, you need a second client/console system with a regular USB port.
46
472. Software requirements:
48
49 a.) On the host/target system:
50
51 You need to enable the following kernel config option:
52
53 CONFIG_EARLY_PRINTK_DBGP=y
54
55 And you need to add the boot command line: "earlyprintk=dbgp".
56 (If you are using Grub, append it to the 'kernel' line in
57 /etc/grub.conf)
58
59 NOTE: normally earlyprintk console gets turned off once the
60 regular console is alive - use "earlyprintk=dbgp,keep" to keep
61 this channel open beyond early bootup. This can be useful for
62 debugging crashes under Xorg, etc.
63
64 b.) On the client/console system:
65
66 You should enable the following kernel config option:
67
68 CONFIG_USB_SERIAL_DEBUG=y
69
70 On the next bootup with the modified kernel you should
71 get a /dev/ttyUSBx device(s).
72
73 Now this channel of kernel messages is ready to be used: start
74 your favorite terminal emulator (minicom, etc.) and set
75 it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
76 see the raw output.
77
78 c.) On Nvidia Southbridge based systems: the kernel will try to probe
79 and find out which port has debug device connected.
80
813. Testing that it works fine:
82
83 You can test the output by using earlyprintk=dbgp,keep and provoking
84 kernel messages on the host/target system. You can provoke a harmless
85 kernel message by for example doing:
86
87 echo h > /proc/sysrq-trigger
88
89 On the host/target system you should see this help line in "dmesg" output:
90
91 SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
92
93 On the client/console system do:
94
95 cat /dev/ttyUSB0
96
97 And you should see the help line above displayed shortly after you've
98 provoked it on the host system.
99
100If it does not work then please ask about it on the linux-kernel@vger.kernel.org
101mailing list or contact the x86 maintainers.
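
For reference, besides minicom or a raw 'cat', a tiny user-space reader can
put the USB serial port into raw mode and stream whatever the target prints.
This is only an illustrative sketch, not part of the patch; it assumes the
debug key enumerated as /dev/ttyUSB0.

  /* dbgp-cat.c - minimal raw reader for the USB debug key console */
  #include <stdio.h>
  #include <unistd.h>
  #include <fcntl.h>
  #include <termios.h>

  int main(int argc, char **argv)
  {
  	const char *dev = argc > 1 ? argv[1] : "/dev/ttyUSB0";
  	struct termios tio;
  	char buf[256];
  	ssize_t n;
  	int fd;

  	fd = open(dev, O_RDONLY | O_NOCTTY);
  	if (fd < 0) {
  		perror(dev);
  		return 1;
  	}

  	/* Raw mode: no line editing or echo, bytes delivered as they arrive */
  	tcgetattr(fd, &tio);
  	cfmakeraw(&tio);
  	tcsetattr(fd, TCSANOW, &tio);

  	while ((n = read(fd, buf, sizeof(buf))) > 0) {
  		fwrite(buf, 1, n, stdout);
  		fflush(stdout);
  	}

  	close(fd);
  	return 0;
  }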
diff --git a/Makefile b/Makefile
index 27fb890a2bff..c40d83aedebe 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
1VERSION = 2 1VERSION = 2
2PATCHLEVEL = 6 2PATCHLEVEL = 6
3SUBLEVEL = 29 3SUBLEVEL = 29
4EXTRAVERSION = -rc6 4EXTRAVERSION = -rc7
5NAME = Erotic Pickled Herring 5NAME = Erotic Pickled Herring
6 6
7# *DOCUMENTATION* 7# *DOCUMENTATION*
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 7049815d66d5..68d6494c0389 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -233,12 +233,13 @@ static void __init cacheid_init(void)
233 unsigned int cachetype = read_cpuid_cachetype(); 233 unsigned int cachetype = read_cpuid_cachetype();
234 unsigned int arch = cpu_architecture(); 234 unsigned int arch = cpu_architecture();
235 235
236 if (arch >= CPU_ARCH_ARMv7) { 236 if (arch >= CPU_ARCH_ARMv6) {
237 cacheid = CACHEID_VIPT_NONALIASING; 237 if ((cachetype & (7 << 29)) == 4 << 29) {
238 if ((cachetype & (3 << 14)) == 1 << 14) 238 /* ARMv7 register format */
239 cacheid |= CACHEID_ASID_TAGGED; 239 cacheid = CACHEID_VIPT_NONALIASING;
240 } else if (arch >= CPU_ARCH_ARMv6) { 240 if ((cachetype & (3 << 14)) == 1 << 14)
241 if (cachetype & (1 << 23)) 241 cacheid |= CACHEID_ASID_TAGGED;
242 } else if (cachetype & (1 << 23))
242 cacheid = CACHEID_VIPT_ALIASING; 243 cacheid = CACHEID_VIPT_ALIASING;
243 else 244 else
244 cacheid = CACHEID_VIPT_NONALIASING; 245 cacheid = CACHEID_VIPT_NONALIASING;
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 9bb4f043aa22..7ac812dc055a 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -332,7 +332,6 @@ static int at91_pm_enter(suspend_state_t state)
332 at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR)); 332 at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR));
333 333
334error: 334error:
335 sdram_selfrefresh_disable();
336 target_state = PM_SUSPEND_ON; 335 target_state = PM_SUSPEND_ON;
337 at91_irq_resume(); 336 at91_irq_resume();
338 at91_gpio_resume(); 337 at91_gpio_resume();
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 8a7f65ba14b7..94077fbd96b7 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -23,7 +23,8 @@ ENTRY(v6_early_abort)
23#ifdef CONFIG_CPU_32v6K 23#ifdef CONFIG_CPU_32v6K
24 clrex 24 clrex
25#else 25#else
26 strex r0, r1, [sp] @ Clear the exclusive monitor 26 sub r1, sp, #4 @ Get unused stack location
27 strex r0, r1, [r1] @ Clear the exclusive monitor
27#endif 28#endif
28 mrc p15, 0, r1, c5, c0, 0 @ get FSR 29 mrc p15, 0, r1, c5, c0, 0 @ get FSR
29 mrc p15, 0, r0, c6, c0, 0 @ get FAR 30 mrc p15, 0, r0, c6, c0, 0 @ get FAR
diff --git a/arch/arm/plat-s3c64xx/irq-eint.c b/arch/arm/plat-s3c64xx/irq-eint.c
index 1f7cc0067f5c..ebb305ce7689 100644
--- a/arch/arm/plat-s3c64xx/irq-eint.c
+++ b/arch/arm/plat-s3c64xx/irq-eint.c
@@ -55,7 +55,7 @@ static void s3c_irq_eint_unmask(unsigned int irq)
55 u32 mask; 55 u32 mask;
56 56
57 mask = __raw_readl(S3C64XX_EINT0MASK); 57 mask = __raw_readl(S3C64XX_EINT0MASK);
58 mask |= eint_irq_to_bit(irq); 58 mask &= ~eint_irq_to_bit(irq);
59 __raw_writel(mask, S3C64XX_EINT0MASK); 59 __raw_writel(mask, S3C64XX_EINT0MASK);
60} 60}
61 61
diff --git a/arch/powerpc/platforms/86xx/gef_sbc610.c b/arch/powerpc/platforms/86xx/gef_sbc610.c
index fb371f5ce132..d6b772ba3b8f 100644
--- a/arch/powerpc/platforms/86xx/gef_sbc610.c
+++ b/arch/powerpc/platforms/86xx/gef_sbc610.c
@@ -142,6 +142,10 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev)
142{ 142{
143 unsigned int val; 143 unsigned int val;
144 144
145 /* Do not do the fixup on other platforms! */
146 if (!machine_is(gef_sbc610))
147 return;
148
145 printk(KERN_INFO "Running NEC uPD720101 Fixup\n"); 149 printk(KERN_INFO "Running NEC uPD720101 Fixup\n");
146 150
147 /* Ensure ports 1, 2, 3, 4 & 5 are enabled */ 151 /* Ensure ports 1, 2, 3, 4 & 5 are enabled */
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index c42cd898f68b..6118890c946d 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void)
556module_init(aes_s390_init); 556module_init(aes_s390_init);
557module_exit(aes_s390_fini); 557module_exit(aes_s390_fini);
558 558
559MODULE_ALIAS("aes"); 559MODULE_ALIAS("aes-all");
560 560
561MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); 561MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
562MODULE_LICENSE("GPL"); 562MODULE_LICENSE("GPL");
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f5cef3fbf9a5..31758378bcd2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -783,6 +783,11 @@ config X86_MCE_AMD
783 Additional support for AMD specific MCE features such as 783 Additional support for AMD specific MCE features such as
784 the DRAM Error Threshold. 784 the DRAM Error Threshold.
785 785
786config X86_MCE_THRESHOLD
787 depends on X86_MCE_AMD || X86_MCE_INTEL
788 bool
789 default y
790
786config X86_MCE_NONFATAL 791config X86_MCE_NONFATAL
787 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 792 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
788 depends on X86_32 && X86_MCE 793 depends on X86_32 && X86_MCE
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..bc9514fb3b13 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
53#define APIC_ESR_SENDILL 0x00020 53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040 54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080 55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_LVTCMCI 0x2f0
56#define APIC_ICR 0x300 57#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000 58#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000 59#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b6..edc90f23e708 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
37 37
38#else /* !CONFIG_X86_32 */ 38#else /* !CONFIG_X86_32 */
39 39
40#define MAX_EFI_IO_PAGES 100
41
42extern u64 efi_call0(void *fp); 40extern u64 efi_call0(void *fp);
43extern u64 efi_call1(void *fp, u64 arg1); 41extern u64 efi_call1(void *fp, u64 arg1);
44extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); 42extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dca8f03da5b2..63a79c77d220 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -24,9 +24,6 @@
24#include <asm/kmap_types.h> 24#include <asm/kmap_types.h>
25#else 25#else
26#include <asm/vsyscall.h> 26#include <asm/vsyscall.h>
27#ifdef CONFIG_EFI
28#include <asm/efi.h>
29#endif
30#endif 27#endif
31 28
32/* 29/*
@@ -92,13 +89,6 @@ enum fixed_addresses {
92 FIX_IO_APIC_BASE_0, 89 FIX_IO_APIC_BASE_0,
93 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, 90 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
94#endif 91#endif
95#ifdef CONFIG_X86_64
96#ifdef CONFIG_EFI
97 FIX_EFI_IO_MAP_LAST_PAGE,
98 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
99 + MAX_EFI_IO_PAGES - 1,
100#endif
101#endif
102#ifdef CONFIG_X86_VISWS_APIC 92#ifdef CONFIG_X86_VISWS_APIC
103 FIX_CO_CPU, /* Cobalt timer */ 93 FIX_CO_CPU, /* Cobalt timer */
104 FIX_CO_APIC, /* Cobalt APIC Redirection Table */ 94 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c9..71c9e5183982 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
172 172
173#else /* CONFIG_X86_32 */ 173#else /* CONFIG_X86_32 */
174 174
175extern void finit(void); 175#ifdef CONFIG_MATH_EMULATION
176extern void finit_task(struct task_struct *tsk);
177#else
178static inline void finit_task(struct task_struct *tsk)
179{
180}
181#endif
176 182
177static inline void tolerant_fwait(void) 183static inline void tolerant_fwait(void)
178{ 184{
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 000000000000..36fb1a6a5109
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_INIT_32_H
2#define _ASM_X86_INIT_32_H
3
4#ifdef CONFIG_X86_32
5extern void __init early_ioremap_page_table_range_init(void);
6#endif
7
8extern unsigned long __init
9kernel_physical_mapping_init(unsigned long start,
10 unsigned long end,
11 unsigned long page_size_mask);
12
13
14extern unsigned long __initdata e820_table_start;
15extern unsigned long __meminitdata e820_table_end;
16extern unsigned long __meminitdata e820_table_top;
17
18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960b..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
11 */ 11 */
12 12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
14 16
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
90 92
91#include <asm/atomic.h> 93#include <asm/atomic.h>
92 94
95void mce_setup(struct mce *m);
93void mce_log(struct mce *m); 96void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce); 97DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96 99
100/*
101 * To support more than 128 would need to escape the predefined
102 * Linux defined extended banks first.
103 */
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105
97#ifdef CONFIG_X86_MCE_INTEL 106#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c); 107void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void);
109void cmci_reenable(void);
110void cmci_rediscover(int dying);
111void cmci_recheck(void);
99#else 112#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } 113static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
114static inline void cmci_clear(void) {}
115static inline void cmci_reenable(void) {}
116static inline void cmci_rediscover(int dying) {}
117static inline void cmci_recheck(void) {}
101#endif 118#endif
102 119
103#ifdef CONFIG_X86_MCE_AMD 120#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif 124#endif
108 125
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status); 126extern int mce_available(struct cpuinfo_x86 *c);
127
128void mce_log_therm_throt_event(__u64 status);
110 129
111extern atomic_t mce_entry; 130extern atomic_t mce_entry;
112 131
113extern void do_machine_check(struct pt_regs *, long); 132extern void do_machine_check(struct pt_regs *, long);
133
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
136
137enum mcp_flags {
138 MCP_TIMESTAMP = (1 << 0), /* log time stamp */
139 MCP_UC = (1 << 1), /* log uncorrected errors */
140};
141extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
142
114extern int mce_notify_user(void); 143extern int mce_notify_user(void);
115 144
116#endif /* !CONFIG_X86_32 */ 145#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
120#else 149#else
121#define mcheck_init(c) do { } while (0) 150#define mcheck_init(c) do { } while (0)
122#endif 151#endif
123extern void stop_mce(void); 152
124extern void restart_mce(void); 153extern void (*mce_threshold_vector)(void);
125 154
126#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
127#endif /* _ASM_X86_MCE_H */ 156#endif /* _ASM_X86_MCE_H */
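
The new mce_banks_t/machine_check_poll() interface lets callers poll an
arbitrary subset of banks. A minimal sketch of how it is meant to be driven,
using only names declared above plus the standard bitmap helpers (the actual
call sites appear in the mce_64.c changes further down):

  #include <linux/bitmap.h>
  #include <asm/mce.h>

  /* Poll only bank 0 for corrected events, timestamping what is logged */
  static void example_poll_bank0(void)
  {
  	mce_banks_t banks;

  	bitmap_zero(banks, MAX_NR_BANKS);
  	set_bit(0, banks);
  	machine_check_poll(MCP_TIMESTAMP, &banks);
  }

  /* Flush everything left over from before a reset, including UC events */
  static void example_flush_boot_leftovers(void)
  {
  	mce_banks_t all_banks;

  	bitmap_fill(all_banks, MAX_NR_BANKS);
  	machine_check_poll(MCP_UC, &all_banks);
  }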
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae04..2dbd2314139e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
77#define MSR_IA32_MC0_ADDR 0x00000402 77#define MSR_IA32_MC0_ADDR 0x00000402
78#define MSR_IA32_MC0_MISC 0x00000403 78#define MSR_IA32_MC0_MISC 0x00000403
79 79
80/* These are consecutive and not in the normal 4er MCE bank block */
81#define MSR_IA32_MC0_CTL2 0x00000280
82#define CMCI_EN (1ULL << 30)
83#define CMCI_THRESHOLD_MASK 0xffffULL
84
80#define MSR_P6_PERFCTR0 0x000000c1 85#define MSR_P6_PERFCTR0 0x000000c1
81#define MSR_P6_PERFCTR1 0x000000c2 86#define MSR_P6_PERFCTR1 0x000000c2
82#define MSR_P6_EVNTSEL0 0x00000186 87#define MSR_P6_EVNTSEL0 0x00000186
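
MSR_IA32_MC0_CTL2 and its successors (one consecutive MSR per bank) carry the
per-bank CMCI enable bit and event threshold. The code that programs them is
in mce_intel_64.c, which this commit reworks but which is not shown above; a
rough sketch of how these constants are used, with an arbitrary threshold:

  #include <linux/errno.h>
  #include <asm/msr.h>

  /* Try to take CMCI ownership of one bank; if the enable bit does not
   * stick, some other agent (typically the BIOS) already owns it. */
  static int example_claim_cmci(int bank)
  {
  	u64 val;

  	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
  	val &= ~CMCI_THRESHOLD_MASK;
  	val |= CMCI_EN | 1;		/* interrupt after 1 corrected event */
  	wrmsrl(MSR_IA32_MC0_CTL2 + bank, val);

  	rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
  	return (val & CMCI_EN) ? 0 : -EBUSY;
  }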
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603c..826ad37006ab 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43struct pgprot;
44
45extern int page_is_ram(unsigned long pagenr); 43extern int page_is_ram(unsigned long pagenr);
46extern int devmem_is_allowed(unsigned long pagenr); 44extern int devmem_is_allowed(unsigned long pagenr);
47extern void map_devmem(unsigned long pfn, unsigned long size,
48 struct pgprot vma_prot);
49extern void unmap_devmem(unsigned long pfn, unsigned long size,
50 struct pgprot vma_prot);
51 45
52extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
53extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838e..2cd07b9422f4 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAT_H 2#define _ASM_X86_PAT_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/pgtable_types.h>
5 6
6#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
7extern int pat_enabled; 8extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
17 18
18extern int kernel_map_sync_memtype(u64 base, unsigned long size, 19extern int kernel_map_sync_memtype(u64 base, unsigned long size,
19 unsigned long flag); 20 unsigned long flag);
21extern void map_devmem(unsigned long pfn, unsigned long size,
22 struct pgprot vma_prot);
23extern void unmap_devmem(unsigned long pfn, unsigned long size,
24 struct pgprot vma_prot);
20 25
21#endif /* _ASM_X86_PAT_H */ 26#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe04..2733fad45f98 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
25 * area for the same reason. ;) 25 * area for the same reason. ;)
26 */ 26 */
27#define VMALLOC_OFFSET (8 * 1024 * 1024) 27#define VMALLOC_OFFSET (8 * 1024 * 1024)
28
29#ifndef __ASSEMBLER__
30extern bool __vmalloc_start_set; /* set once high_memory is set */
31#endif
32
28#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) 33#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
29#ifdef CONFIG_X86_PAE 34#ifdef CONFIG_X86_PAE
30#define LAST_PKMAP 512 35#define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..b8238dc8786d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
273 273
274extern pteval_t __supported_pte_mask; 274extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 275extern int nx_enabled;
276extern void set_nx(void);
276 277
277#define pgprot_writecombine pgprot_writecombine 278#define pgprot_writecombine pgprot_writecombine
278extern pgprot_t pgprot_writecombine(pgprot_t prot); 279extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d52..4c80f1557433 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
414 that might execute the to be patched code. 414 that might execute the to be patched code.
415 Other CPUs are not running. */ 415 Other CPUs are not running. */
416 stop_nmi(); 416 stop_nmi();
417#ifdef CONFIG_X86_MCE 417
418 stop_mce(); 418 /*
419#endif 419 * Don't stop machine check exceptions while patching.
420 * MCEs only happen when something got corrupted and in this
421 * case we must do something about the corruption.
422 * Ignoring it is worse than a unlikely patching race.
423 * Also machine checks tend to be broadcast and if one CPU
424 * goes into machine check the others follow quickly, so we don't
425 * expect a machine check to cause undue problems during to code
426 * patching.
427 */
420 428
421 apply_alternatives(__alt_instructions, __alt_instructions_end); 429 apply_alternatives(__alt_instructions, __alt_instructions_end);
422 430
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
456 (unsigned long)__smp_locks_end); 464 (unsigned long)__smp_locks_end);
457 465
458 restart_nmi(); 466 restart_nmi();
459#ifdef CONFIG_X86_MCE
460 restart_mce();
461#endif
462} 467}
463 468
464/** 469/**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c5..30909a258d0f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/mtrr.h> 47#include <asm/mtrr.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/mce.h>
49 50
50unsigned int num_processors; 51unsigned int num_processors;
51 52
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
842 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 843 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
843 } 844 }
844#endif 845#endif
846#ifdef CONFIG_X86_MCE_INTEL
847 if (maxlvt >= 6) {
848 v = apic_read(APIC_LVTCMCI);
849 if (!(v & APIC_LVT_MASKED))
850 apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
851 }
852#endif
853
845 /* 854 /*
846 * Clean APIC state for other OSs: 855 * Clean APIC state for other OSs:
847 */ 856 */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
1241 apic_write(APIC_LVT1, value); 1250 apic_write(APIC_LVT1, value);
1242 1251
1243 preempt_enable(); 1252 preempt_enable();
1253
1254#ifdef CONFIG_X86_MCE_INTEL
1255 /* Recheck CMCI information after local APIC is up on CPU #0 */
1256 if (smp_processor_id() == 0)
1257 cmci_recheck();
1258#endif
1244} 1259}
1245 1260
1246void __cpuinit end_local_APIC_setup(void) 1261void __cpuinit end_local_APIC_setup(void)
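
clear_local_APIC() now also masks the CMCI LVT entry added in apicdef.h
above, and CPU #0 rechecks CMCI ownership once its local APIC is up. The
enabling side lives in mce_intel_64.c (not shown here); roughly, it points
the entry at the existing threshold vector, along these lines:

  #include <asm/apic.h>
  #include <asm/irq_vectors.h>

  static void example_setup_cmci_lvt(void)
  {
  	/* The CMCI entry only exists with six or more LVT entries */
  	if (lapic_get_maxlvt() >= 6)
  		apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR | APIC_DM_FIXED);
  }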
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
60 } 60 }
61} 61}
62 62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str) 63static int __init mcheck_disable(char *str)
78{ 64{
79 mce_disabled = 1; 65 mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..bfbd5323a635 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -24,6 +26,9 @@
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/kmod.h> 27#include <linux/kmod.h>
26#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
27#include <asm/processor.h> 32#include <asm/processor.h>
28#include <asm/msr.h> 33#include <asm/msr.h>
29#include <asm/mce.h> 34#include <asm/mce.h>
@@ -32,7 +37,6 @@
32#include <asm/idle.h> 37#include <asm/idle.h>
33 38
34#define MISC_MCELOG_MINOR 227 39#define MISC_MCELOG_MINOR 227
35#define NR_SYSFS_BANKS 6
36 40
37atomic_t mce_entry; 41atomic_t mce_entry;
38 42
@@ -47,7 +51,7 @@ static int mce_dont_init;
47 */ 51 */
48static int tolerant = 1; 52static int tolerant = 1;
49static int banks; 53static int banks;
50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; 54static u64 *bank;
51static unsigned long notify_user; 55static unsigned long notify_user;
52static int rip_msr; 56static int rip_msr;
53static int mce_bootlog = -1; 57static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
58 62
59static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
60 64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
61/* 78/*
62 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
63 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
119 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
120 printk("\n"); 137 printk("\n");
121 } 138 }
122 printk(KERN_EMERG "TSC %Lx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
123 if (m->addr) 140 if (m->addr)
124 printk("ADDR %Lx ", m->addr); 141 printk("ADDR %llx ", m->addr);
125 if (m->misc) 142 if (m->misc)
126 printk("MISC %Lx ", m->misc); 143 printk("MISC %llx ", m->misc);
127 printk("\n"); 144 printk("\n");
128 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
129 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
149 panic(msg); 166 panic(msg);
150} 167}
151 168
152static int mce_available(struct cpuinfo_x86 *c) 169int mce_available(struct cpuinfo_x86 *c)
153{ 170{
171 if (mce_dont_init)
172 return 0;
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
155} 174}
156 175
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
172} 191}
173 192
174/* 193/*
175 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank.
248 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
176 */ 265 */
177void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
178{ 267{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
190 * error. 279 * error.
191 */ 280 */
192 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
193 283
194 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
195 285
196 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
197 && notify_die(DIE_NMI, "machine check", regs, error_code,
198 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
199 || !banks) 288 goto out2;
289 if (!banks)
200 goto out2; 290 goto out2;
201 291
202 memset(&m, 0, sizeof(struct mce)); 292 mce_setup(&m);
203 m.cpu = smp_processor_id(); 293
204 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 294 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
205 /* if the restart IP is not valid, we're done for */ 295 /* if the restart IP is not valid, we're done for */
206 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 296 if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
210 barrier(); 300 barrier();
211 301
212 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
213 if (i < NR_SYSFS_BANKS && !bank[i]) 303 __clear_bit(i, toclear);
304 if (!bank[i])
214 continue; 305 continue;
215 306
216 m.misc = 0; 307 m.misc = 0;
217 m.addr = 0; 308 m.addr = 0;
218 m.bank = i; 309 m.bank = i;
219 m.tsc = 0;
220 310
221 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
222 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
223 continue; 313 continue;
224 314
315 /*
316 * Non uncorrected errors are handled by machine_check_poll
317 * Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
225 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
226 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
227 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
235 no_way_out = 1; 339 no_way_out = 1;
236 kill_it = 1; 340 kill_it = 1;
237 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
238 } 348 }
239 349
240 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
243 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244 354
245 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
246 if (error_code >= 0) 356 mce_log(&m);
247 rdtscll(m.tsc);
248 if (error_code != -2)
249 mce_log(&m);
250 357
251 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
252 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
255 panicm = m; 362 panicm = m;
256 panicm_found = 1; 363 panicm_found = 1;
257 } 364 }
258
259 add_taint(TAINT_MACHINE_CHECK);
260 } 365 }
261 366
262 /* Never do anything final in the polling timer */
263 if (!regs)
264 goto out;
265
266 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
267 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
268 if (!panicm_found) 369 if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
309 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
310 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
311 412
312 out:
313 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
314 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
315 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
316 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
317 out2: 419 out2:
318 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
332 * and historically has been the register value of the 434 * and historically has been the register value of the
333 * MSR_IA32_THERMAL_STATUS (Intel) msr. 435 * MSR_IA32_THERMAL_STATUS (Intel) msr.
334 */ 436 */
335void mce_log_therm_throt_event(unsigned int cpu, __u64 status) 437void mce_log_therm_throt_event(__u64 status)
336{ 438{
337 struct mce m; 439 struct mce m;
338 440
339 memset(&m, 0, sizeof(m)); 441 mce_setup(&m);
340 m.cpu = cpu;
341 m.bank = MCE_THERMAL_BANK; 442 m.bank = MCE_THERMAL_BANK;
342 m.status = status; 443 m.status = status;
343 rdtscll(m.tsc);
344 mce_log(&m); 444 mce_log(&m);
345} 445}
346#endif /* CONFIG_X86_MCE_INTEL */ 446#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
353 453
354static int check_interval = 5 * 60; /* 5 minutes */ 454static int check_interval = 5 * 60; /* 5 minutes */
355static int next_interval; /* in jiffies */ 455static int next_interval; /* in jiffies */
356static void mcheck_timer(struct work_struct *work); 456static void mcheck_timer(unsigned long);
357static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 457static DEFINE_PER_CPU(struct timer_list, mce_timer);
358 458
359static void mcheck_check_cpu(void *info) 459static void mcheck_timer(unsigned long data)
360{ 460{
361 if (mce_available(&current_cpu_data)) 461 struct timer_list *t = &per_cpu(mce_timer, data);
362 do_machine_check(NULL, 0);
363}
364 462
365static void mcheck_timer(struct work_struct *work) 463 WARN_ON(smp_processor_id() != data);
366{ 464
367 on_each_cpu(mcheck_check_cpu, NULL, 1); 465 if (mce_available(&current_cpu_data))
466 machine_check_poll(MCP_TIMESTAMP,
467 &__get_cpu_var(mce_poll_banks));
368 468
369 /* 469 /*
370 * Alert userspace if needed. If we logged an MCE, reduce the 470 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
377 (int)round_jiffies_relative(check_interval*HZ)); 477 (int)round_jiffies_relative(check_interval*HZ));
378 } 478 }
379 479
380 schedule_delayed_work(&mcheck_work, next_interval); 480 t->expires = jiffies + next_interval;
481 add_timer(t);
482}
483
484static void mce_do_trigger(struct work_struct *work)
485{
486 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
381} 487}
382 488
489static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
490
383/* 491/*
384 * This is only called from process context. This is where we do 492 * Notify the user(s) about new machine check events.
385 * anything we need to alert userspace about new MCEs. This is called 493 * Can be called from interrupt context, but not from machine check/NMI
386 * directly from the poller and also from entry.S and idle, thanks to 494 * context.
387 * TIF_MCE_NOTIFY.
388 */ 495 */
389int mce_notify_user(void) 496int mce_notify_user(void)
390{ 497{
498 /* Not more than two messages every minute */
499 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
500
391 clear_thread_flag(TIF_MCE_NOTIFY); 501 clear_thread_flag(TIF_MCE_NOTIFY);
392 if (test_and_clear_bit(0, &notify_user)) { 502 if (test_and_clear_bit(0, &notify_user)) {
393 static unsigned long last_print;
394 unsigned long now = jiffies;
395
396 wake_up_interruptible(&mce_wait); 503 wake_up_interruptible(&mce_wait);
397 if (trigger[0])
398 call_usermodehelper(trigger, trigger_argv, NULL,
399 UMH_NO_WAIT);
400 504
401 if (time_after_eq(now, last_print + (check_interval*HZ))) { 505 /*
402 last_print = now; 506 * There is no risk of missing notifications because
507 * work_pending is always cleared before the function is
508 * executed.
509 */
510 if (trigger[0] && !work_pending(&mce_trigger_work))
511 schedule_work(&mce_trigger_work);
512
513 if (__ratelimit(&ratelimit))
403 printk(KERN_INFO "Machine check events logged\n"); 514 printk(KERN_INFO "Machine check events logged\n");
404 }
405 515
406 return 1; 516 return 1;
407 } 517 }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
425 535
426static __init int periodic_mcheck_init(void) 536static __init int periodic_mcheck_init(void)
427{ 537{
428 next_interval = check_interval * HZ; 538 idle_notifier_register(&mce_idle_notifier);
429 if (next_interval) 539 return 0;
430 schedule_delayed_work(&mcheck_work,
431 round_jiffies_relative(next_interval));
432 idle_notifier_register(&mce_idle_notifier);
433 return 0;
434} 540}
435__initcall(periodic_mcheck_init); 541__initcall(periodic_mcheck_init);
436 542
437
438/* 543/*
439 * Initialize Machine Checks for a CPU. 544 * Initialize Machine Checks for a CPU.
440 */ 545 */
441static void mce_init(void *dummy) 546static int mce_cap_init(void)
442{ 547{
443 u64 cap; 548 u64 cap;
444 int i; 549 unsigned b;
445 550
446 rdmsrl(MSR_IA32_MCG_CAP, cap); 551 rdmsrl(MSR_IA32_MCG_CAP, cap);
447 banks = cap & 0xff; 552 b = cap & 0xff;
448 if (banks > MCE_EXTENDED_BANK) { 553 if (b > MAX_NR_BANKS) {
449 banks = MCE_EXTENDED_BANK; 554 printk(KERN_WARNING
450 printk(KERN_INFO "MCE: warning: using only %d banks\n", 555 "MCE: Using only %u machine check banks out of %u\n",
451 MCE_EXTENDED_BANK); 556 MAX_NR_BANKS, b);
557 b = MAX_NR_BANKS;
452 } 558 }
559
560 /* Don't support asymmetric configurations today */
561 WARN_ON(banks != 0 && b != banks);
562 banks = b;
563 if (!bank) {
564 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
565 if (!bank)
566 return -ENOMEM;
567 memset(bank, 0xff, banks * sizeof(u64));
568 }
569
453 /* Use accurate RIP reporting if available. */ 570 /* Use accurate RIP reporting if available. */
454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 571 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
455 rip_msr = MSR_IA32_MCG_EIP; 572 rip_msr = MSR_IA32_MCG_EIP;
456 573
457 /* Log the machine checks left over from the previous reset. 574 return 0;
458 This also clears all registers */ 575}
459 do_machine_check(NULL, mce_bootlog ? -1 : -2); 576
577static void mce_init(void *dummy)
578{
579 u64 cap;
580 int i;
581 mce_banks_t all_banks;
582
583 /*
584 * Log the machine checks left over from the previous reset.
585 */
586 bitmap_fill(all_banks, MAX_NR_BANKS);
587 machine_check_poll(MCP_UC, &all_banks);
460 588
461 set_in_cr4(X86_CR4_MCE); 589 set_in_cr4(X86_CR4_MCE);
462 590
591 rdmsrl(MSR_IA32_MCG_CAP, cap);
463 if (cap & MCG_CTL_P) 592 if (cap & MCG_CTL_P)
464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 593 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
465 594
466 for (i = 0; i < banks; i++) { 595 for (i = 0; i < banks; i++) {
467 if (i < NR_SYSFS_BANKS) 596 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 597 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
473 } 598 }
474} 599}
475 600
476/* Add per CPU specific workarounds here */ 601/* Add per CPU specific workarounds here */
477static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 602static void mce_cpu_quirks(struct cpuinfo_x86 *c)
478{ 603{
479 /* This should be disabled by the BIOS, but isn't always */ 604 /* This should be disabled by the BIOS, but isn't always */
480 if (c->x86_vendor == X86_VENDOR_AMD) { 605 if (c->x86_vendor == X86_VENDOR_AMD) {
481 if(c->x86 == 15) 606 if (c->x86 == 15 && banks > 4)
482 /* disable GART TBL walk error reporting, which trips off 607 /* disable GART TBL walk error reporting, which trips off
483 incorrectly with the IOMMU & 3ware & Cerberus. */ 608 incorrectly with the IOMMU & 3ware & Cerberus. */
484 clear_bit(10, &bank[4]); 609 clear_bit(10, (unsigned long *)&bank[4]);
485 if(c->x86 <= 17 && mce_bootlog < 0) 610 if(c->x86 <= 17 && mce_bootlog < 0)
486 /* Lots of broken BIOS around that don't clear them 611 /* Lots of broken BIOS around that don't clear them
487 by default and leave crap in there. Don't log. */ 612 by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
504 } 629 }
505} 630}
506 631
632static void mce_init_timer(void)
633{
634 struct timer_list *t = &__get_cpu_var(mce_timer);
635
636 /* data race harmless because everyone sets to the same value */
637 if (!next_interval)
638 next_interval = check_interval * HZ;
639 if (!next_interval)
640 return;
641 setup_timer(t, mcheck_timer, smp_processor_id());
642 t->expires = round_jiffies_relative(jiffies + next_interval);
643 add_timer(t);
644}
645
507/* 646/*
508 * Called for each booted CPU to set up machine checks. 647 * Called for each booted CPU to set up machine checks.
509 * Must be called with preempt off. 648 * Must be called with preempt off.
510 */ 649 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 650void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 651{
513 mce_cpu_quirks(c); 652 if (!mce_available(c))
653 return;
514 654
515 if (mce_dont_init || 655 if (mce_cap_init() < 0) {
516 !mce_available(c)) 656 mce_dont_init = 1;
517 return; 657 return;
658 }
659 mce_cpu_quirks(c);
518 660
519 mce_init(NULL); 661 mce_init(NULL);
520 mce_cpu_features(c); 662 mce_cpu_features(c);
663 mce_init_timer();
521} 664}
522 665
523/* 666/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573{ 716{
574 unsigned long *cpu_tsc; 717 unsigned long *cpu_tsc;
575 static DEFINE_MUTEX(mce_read_mutex); 718 static DEFINE_MUTEX(mce_read_mutex);
576 unsigned next; 719 unsigned prev, next;
577 char __user *buf = ubuf; 720 char __user *buf = ubuf;
578 int i, err; 721 int i, err;
579 722
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
592 } 735 }
593 736
594 err = 0; 737 err = 0;
595 for (i = 0; i < next; i++) { 738 prev = 0;
596 unsigned long start = jiffies; 739 do {
597 740 for (i = prev; i < next; i++) {
598 while (!mcelog.entry[i].finished) { 741 unsigned long start = jiffies;
599 if (time_after_eq(jiffies, start + 2)) { 742
600 memset(mcelog.entry + i,0, sizeof(struct mce)); 743 while (!mcelog.entry[i].finished) {
601 goto timeout; 744 if (time_after_eq(jiffies, start + 2)) {
745 memset(mcelog.entry + i, 0,
746 sizeof(struct mce));
747 goto timeout;
748 }
749 cpu_relax();
602 } 750 }
603 cpu_relax(); 751 smp_rmb();
752 err |= copy_to_user(buf, mcelog.entry + i,
753 sizeof(struct mce));
754 buf += sizeof(struct mce);
755timeout:
756 ;
604 } 757 }
605 smp_rmb();
606 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
607 buf += sizeof(struct mce);
608 timeout:
609 ;
610 }
611 758
612 memset(mcelog.entry, 0, next * sizeof(struct mce)); 759 memset(mcelog.entry + prev, 0,
613 mcelog.next = 0; 760 (next - prev) * sizeof(struct mce));
761 prev = next;
762 next = cmpxchg(&mcelog.next, prev, 0);
763 } while (next != prev);
614 764
615 synchronize_sched(); 765 synchronize_sched();
616 766
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
680 &mce_chrdev_ops, 830 &mce_chrdev_ops,
681}; 831};
682 832
683static unsigned long old_cr4 __initdata;
684
685void __init stop_mce(void)
686{
687 old_cr4 = read_cr4();
688 clear_in_cr4(X86_CR4_MCE);
689}
690
691void __init restart_mce(void)
692{
693 if (old_cr4 & X86_CR4_MCE)
694 set_in_cr4(X86_CR4_MCE);
695}
696
697/* 833/*
698 * Old style boot options parsing. Only for compatibility. 834 * Old style boot options parsing. Only for compatibility.
699 */ 835 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
703 return 1; 839 return 1;
704} 840}
705 841
706/* mce=off disables machine check. Note you can re-enable it later 842/* mce=off disables machine check.
707 using sysfs.
708 mce=TOLERANCELEVEL (number, see above) 843 mce=TOLERANCELEVEL (number, see above)
709 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 844 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
710 mce=nobootlog Don't log MCEs from before booting. */ 845 mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
728 * Sysfs support 863 * Sysfs support
729 */ 864 */
730 865
866/*
867 * Disable machine checks on suspend and shutdown. We can't really handle
868 * them later.
869 */
870static int mce_disable(void)
871{
872 int i;
873
874 for (i = 0; i < banks; i++)
875 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
876 return 0;
877}
878
879static int mce_suspend(struct sys_device *dev, pm_message_t state)
880{
881 return mce_disable();
882}
883
884static int mce_shutdown(struct sys_device *dev)
885{
886 return mce_disable();
887}
888
731/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 889/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
732 Only one CPU is active at this time, the others get readded later using 890 Only one CPU is active at this time, the others get readded later using
733 CPU hotplug. */ 891 CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
738 return 0; 896 return 0;
739} 897}
740 898
899static void mce_cpu_restart(void *data)
900{
901 del_timer_sync(&__get_cpu_var(mce_timer));
902 if (mce_available(&current_cpu_data))
903 mce_init(NULL);
904 mce_init_timer();
905}
906
741/* Reinit MCEs after user configuration changes */ 907/* Reinit MCEs after user configuration changes */
742static void mce_restart(void) 908static void mce_restart(void)
743{ 909{
744 if (next_interval)
745 cancel_delayed_work(&mcheck_work);
746 /* Timer race is harmless here */
747 on_each_cpu(mce_init, NULL, 1);
748 next_interval = check_interval * HZ; 910 next_interval = check_interval * HZ;
749 if (next_interval) 911 on_each_cpu(mce_cpu_restart, NULL, 1);
750 schedule_delayed_work(&mcheck_work,
751 round_jiffies_relative(next_interval));
752} 912}
753 913
754static struct sysdev_class mce_sysclass = { 914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
755 .resume = mce_resume, 917 .resume = mce_resume,
756 .name = "machinecheck", 918 .name = "machinecheck",
757}; 919};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
778 } \ 940 } \
779 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
780 942
781/* 943static struct sysdev_attribute *bank_attrs;
782 * TBD should generate these dynamically based on number of available banks. 944
783 * Have only 6 contol banks in /sysfs until then. 945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
784 */ 946 char *buf)
785ACCESSOR(bank0ctl,bank[0],mce_restart()) 947{
786ACCESSOR(bank1ctl,bank[1],mce_restart()) 948 u64 b = bank[attr - bank_attrs];
787ACCESSOR(bank2ctl,bank[2],mce_restart()) 949 return sprintf(buf, "%llx\n", b);
788ACCESSOR(bank3ctl,bank[3],mce_restart()) 950}
789ACCESSOR(bank4ctl,bank[4],mce_restart()) 951
790ACCESSOR(bank5ctl,bank[5],mce_restart()) 952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
791 963
792static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
793 char *buf) 965 char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
814static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
815ACCESSOR(check_interval,check_interval,mce_restart()) 987ACCESSOR(check_interval,check_interval,mce_restart())
816static struct sysdev_attribute *mce_attributes[] = { 988static struct sysdev_attribute *mce_attributes[] = {
817 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
818 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
819 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
820 NULL 990 NULL
821}; 991};
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
845 if (err) 1015 if (err)
846 goto error; 1016 goto error;
847 } 1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
848 cpu_set(cpu, mce_device_initialized); 1024 cpu_set(cpu, mce_device_initialized);
849 1025
850 return 0; 1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
851error: 1032error:
852 while (i--) { 1033 while (--i >= 0) {
853 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
854 mce_attributes[i]); 1035 mce_attributes[i]);
855 } 1036 }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
868 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
869 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
870 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
871 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
872 cpu_clear(cpu, mce_device_initialized); 1056 cpu_clear(cpu, mce_device_initialized);
873} 1057}
874 1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
875/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
876static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, 1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
878{ 1089{
879 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
880 1092
881 switch (action) { 1093 switch (action) {
882 case CPU_ONLINE: 1094 case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
891 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
892 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
893 break; 1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies_relative(jiffies + next_interval);
1114 add_timer_on(t, cpu);
1115 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116 break;
1117 case CPU_POST_DEAD:
1118 /* intentionally ignoring frozen here */
1119 cmci_rediscover(cpu);
1120 break;
894 } 1121 }
895 return NOTIFY_OK; 1122 return NOTIFY_OK;
896} 1123}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
899 .notifier_call = mce_cpu_callback, 1126 .notifier_call = mce_cpu_callback,
900}; 1127};
901 1128
1129static __init int mce_init_banks(void)
1130{
1131 int i;
1132
1133 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134 GFP_KERNEL);
1135 if (!bank_attrs)
1136 return -ENOMEM;
1137
1138 for (i = 0; i < banks; i++) {
1139 struct sysdev_attribute *a = &bank_attrs[i];
1140 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141 if (!a->attr.name)
1142 goto nomem;
1143 a->attr.mode = 0644;
1144 a->show = show_bank;
1145 a->store = set_bank;
1146 }
1147 return 0;
1148
1149nomem:
1150 while (--i >= 0)
1151 kfree(bank_attrs[i].attr.name);
1152 kfree(bank_attrs);
1153 bank_attrs = NULL;
1154 return -ENOMEM;
1155}
1156
902static __init int mce_init_device(void) 1157static __init int mce_init_device(void)
903{ 1158{
904 int err; 1159 int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
906 1161
907 if (!mce_available(&boot_cpu_data)) 1162 if (!mce_available(&boot_cpu_data))
908 return -EIO; 1163 return -EIO;
1164
1165 err = mce_init_banks();
1166 if (err)
1167 return err;
1168
909 err = sysdev_class_register(&mce_sysclass); 1169 err = sysdev_class_register(&mce_sysclass);
910 if (err) 1170 if (err)
911 return err; 1171 return err;
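The mce_64.c hunks above replace the fixed bank0ctl..bank5ctl attributes with a bank_attrs[] array sized at runtime from the detected bank count, and both error paths unwind only the sysfs files that were actually created. A minimal user-space sketch of the same allocate-named-entries-and-unwind pattern, with asprintf standing in for kasprintf and a simplified struct in place of sysdev_attribute (this is illustrative only, not the kernel API):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

struct bank_attr {
    char *name;      /* stands in for sysdev_attribute.attr.name */
    int mode;
};

static struct bank_attr *bank_attrs;

/* Allocate one named attribute per bank; free everything on failure. */
static int init_banks(int banks)
{
    int i;

    bank_attrs = calloc(banks, sizeof(*bank_attrs));
    if (!bank_attrs)
        return -1;

    for (i = 0; i < banks; i++) {
        if (asprintf(&bank_attrs[i].name, "bank%d", i) < 0)
            goto nomem;
        bank_attrs[i].mode = 0644;
    }
    return 0;

nomem:
    while (--i >= 0)        /* unwind only the entries already created */
        free(bank_attrs[i].name);
    free(bank_attrs);
    bank_attrs = NULL;
    return -1;
}

int main(void)
{
    if (init_banks(6) == 0)
        printf("created %s .. %s\n", bank_attrs[0].name, bank_attrs[5].name);
    return 0;
}

Unwinding with while (--i >= 0) never touches the entry whose creation failed, which is the same idiom the patch uses at the error2 and error labels in mce_create_device().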
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd469..c5a32f92d07e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
79 79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81 81
82static void amd_threshold_interrupt(void);
83
82/* 84/*
83 * CPU Initialization 85 * CPU Initialization
84 */ 86 */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
174 tr.reset = 0; 176 tr.reset = 0;
175 tr.old_limit = 0; 177 tr.old_limit = 0;
176 threshold_restart_bank(&tr); 178 threshold_restart_bank(&tr);
179
180 mce_threshold_vector = amd_threshold_interrupt;
177 } 181 }
178 } 182 }
179} 183}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
187 * the interrupt goes off when error_count reaches threshold_limit. 191 * the interrupt goes off when error_count reaches threshold_limit.
188 * the handler will simply log mcelog w/ software defined bank number. 192 * the handler will simply log mcelog w/ software defined bank number.
189 */ 193 */
190asmlinkage void mce_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
191{ 195{
192 unsigned int bank, block; 196 unsigned int bank, block;
193 struct mce m; 197 struct mce m;
194 u32 low = 0, high = 0, address = 0; 198 u32 low = 0, high = 0, address = 0;
195 199
196 ack_APIC_irq(); 200 mce_setup(&m);
197 exit_idle();
198 irq_enter();
199
200 memset(&m, 0, sizeof(m));
201 rdtscll(m.tsc);
202 m.cpu = smp_processor_id();
203 201
204 /* assume first bank caused it */ 202 /* assume first bank caused it */
205 for (bank = 0; bank < NR_BANKS; ++bank) { 203 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
233 231
234 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
235 event. */ 233 event. */
236 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks));
237 236
238 if (high & MASK_OVERFLOW_HI) { 237 if (high & MASK_OVERFLOW_HI) {
239 rdmsrl(address, m.misc); 238 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
243 + bank * NR_BLOCKS 242 + bank * NR_BLOCKS
244 + block; 243 + block;
245 mce_log(&m); 244 mce_log(&m);
246 goto out; 245 return;
247 } 246 }
248 } 247 }
249 } 248 }
250out:
251 inc_irq_stat(irq_threshold_count);
252 irq_exit();
253} 249}
254 250
255/* 251/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e0..aaa7d9730938 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
1/* 1/*
2 * Intel specific MCE features. 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> 3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
4 */ 6 */
5 7
6#include <linux/init.h> 8#include <linux/init.h>
@@ -13,6 +15,7 @@
13#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
16 19
17asmlinkage void smp_thermal_interrupt(void) 20asmlinkage void smp_thermal_interrupt(void)
18{ 21{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
25 28
26 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
27 if (therm_throt_process(msr_val & 1)) 30 if (therm_throt_process(msr_val & 1))
28 mce_log_therm_throt_event(smp_processor_id(), msr_val); 31 mce_log_therm_throt_event(msr_val);
29 32
30 inc_irq_stat(irq_thermal_count); 33 inc_irq_stat(irq_thermal_count);
31 irq_exit(); 34 irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
85 return; 88 return;
86} 89}
87 90
91/*
 92 * Support for Intel Corrected Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened.
94 * Normally we pick those up using a regular polling timer.
95 * Also supports reliable discovery of shared banks.
96 */
97
98static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
99
100/*
101 * cmci_discover_lock protects against parallel discovery attempts
102 * which could race against each other.
103 */
104static DEFINE_SPINLOCK(cmci_discover_lock);
105
106#define CMCI_THRESHOLD 1
107
108static int cmci_supported(int *banks)
109{
110 u64 cap;
111
112 /*
113 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this
115 * makes sure none of the backdoors are entered otherwise.
116 */
117 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
118 return 0;
119 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
120 return 0;
121 rdmsrl(MSR_IA32_MCG_CAP, cap);
122 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
123 return !!(cap & MCG_CMCI_P);
124}
125
126/*
127 * The interrupt handler. This is called on every event.
128 * Just call the poller directly to log any events.
129 * This could in theory increase the threshold under high load,
130 * but doesn't for now.
131 */
132static void intel_threshold_interrupt(void)
133{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user();
136}
137
138static void print_update(char *type, int *hdr, int num)
139{
140 if (*hdr == 0)
141 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
142 *hdr = 1;
143 printk(KERN_CONT " %s:%d", type, num);
144}
145
146/*
147 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
148 * on this CPU. Use the algorithm recommended in the SDM to discover shared
149 * banks.
150 */
151static void cmci_discover(int banks, int boot)
152{
153 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
154 int hdr = 0;
155 int i;
156
157 spin_lock(&cmci_discover_lock);
158 for (i = 0; i < banks; i++) {
159 u64 val;
160
161 if (test_bit(i, owned))
162 continue;
163
164 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
165
166 /* Already owned by someone else? */
167 if (val & CMCI_EN) {
168 if (test_and_clear_bit(i, owned) || boot)
169 print_update("SHD", &hdr, i);
170 __clear_bit(i, __get_cpu_var(mce_poll_banks));
171 continue;
172 }
173
174 val |= CMCI_EN | CMCI_THRESHOLD;
175 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
176 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
177
178 /* Did the enable bit stick? -- the bank supports CMCI */
179 if (val & CMCI_EN) {
180 if (!test_and_set_bit(i, owned) || boot)
181 print_update("CMCI", &hdr, i);
182 __clear_bit(i, __get_cpu_var(mce_poll_banks));
183 } else {
184 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
185 }
186 }
187 spin_unlock(&cmci_discover_lock);
188 if (hdr)
189 printk(KERN_CONT "\n");
190}
191
192/*
193 * Just in case we missed an event during initialization check
194 * all the CMCI owned banks.
195 */
196void cmci_recheck(void)
197{
198 unsigned long flags;
199 int banks;
200
201 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
202 return;
203 local_irq_save(flags);
204 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
205 local_irq_restore(flags);
206}
207
208/*
209 * Disable CMCI on this CPU for all banks it owns when it goes down.
210 * This allows other CPUs to claim the banks on rediscovery.
211 */
212void cmci_clear(void)
213{
214 int i;
215 int banks;
216 u64 val;
217
218 if (!cmci_supported(&banks))
219 return;
220 spin_lock(&cmci_discover_lock);
221 for (i = 0; i < banks; i++) {
222 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
223 continue;
224 /* Disable CMCI */
225 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
226 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
227 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
228 __clear_bit(i, __get_cpu_var(mce_banks_owned));
229 }
230 spin_unlock(&cmci_discover_lock);
231}
232
233/*
234 * After a CPU went down, cycle through all the others and rediscover the CMCI banks.
235 * Must run in process context.
236 */
237void cmci_rediscover(int dying)
238{
239 int banks;
240 int cpu;
241 cpumask_var_t old;
242
243 if (!cmci_supported(&banks))
244 return;
245 if (!alloc_cpumask_var(&old, GFP_KERNEL))
246 return;
247 cpumask_copy(old, &current->cpus_allowed);
248
249 for_each_online_cpu (cpu) {
250 if (cpu == dying)
251 continue;
252 if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
253 continue;
254 /* Recheck banks in case CPUs don't all have the same */
255 if (cmci_supported(&banks))
256 cmci_discover(banks, 0);
257 }
258
259 set_cpus_allowed_ptr(current, old);
260 free_cpumask_var(old);
261}
262
263/*
264 * Reenable CMCI on this CPU in case a CPU down failed.
265 */
266void cmci_reenable(void)
267{
268 int banks;
269 if (cmci_supported(&banks))
270 cmci_discover(banks, 0);
271}
272
273static __cpuinit void intel_init_cmci(void)
274{
275 int banks;
276
277 if (!cmci_supported(&banks))
278 return;
279
280 mce_threshold_vector = intel_threshold_interrupt;
281 cmci_discover(banks, 1);
282 /*
283 * For CPU #0 this runs with still disabled APIC, but that's
284 * ok because only the vector is set up. We still do another
285 * check for the banks later for CPU #0 just to make sure
286 * to not miss any events.
287 */
288 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
289 cmci_recheck();
290}
291
88void mce_intel_feature_init(struct cpuinfo_x86 *c) 292void mce_intel_feature_init(struct cpuinfo_x86 *c)
89{ 293{
90 intel_init_thermal(c); 294 intel_init_thermal(c);
295 intel_init_cmci();
91} 296}
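cmci_discover() claims a bank by setting CMCI_EN in MSR_IA32_MC0_CTL2 + i and reading the register back: if the bit was already set, another CPU owns the (shared) bank; if the write does not stick, the bank has no CMCI support and stays in mce_poll_banks. A stand-alone sketch of that probe, with an array standing in for the MSRs and made-up rd/wr helpers and bit values (illustrative only, not the kernel interface):

#include <stdio.h>
#include <stdint.h>

#define CMCI_EN        (1ULL << 30)
#define CMCI_THRESHOLD 1

/* Fake MSR bank: one pre-owned bank, one bank without CMCI support. */
static uint64_t mc_ctl2[4];
static int no_cmci[4] = { 0, 0, 1, 0 };   /* bank 2 ignores the enable bit */

static uint64_t rd(int i) { return mc_ctl2[i]; }
static void wr(int i, uint64_t v)
{
    mc_ctl2[i] = no_cmci[i] ? (v & ~CMCI_EN) : v;
}

int main(void)
{
    mc_ctl2[1] = CMCI_EN | CMCI_THRESHOLD;   /* bank 1 already claimed elsewhere */

    for (int i = 0; i < 4; i++) {
        uint64_t val = rd(i);

        if (val & CMCI_EN) {                 /* already owned: treat as shared */
            printf("bank %d: SHD\n", i);
            continue;
        }
        wr(i, val | CMCI_EN | CMCI_THRESHOLD);
        if (rd(i) & CMCI_EN)                 /* did the enable bit stick? */
            printf("bank %d: CMCI\n", i);
        else
            printf("bank %d: polled only\n", i);
    }
    return 0;
}

Because ownership is per CPU, cmci_rediscover() simply reruns this probe on each surviving CPU after a hot-unplugged CPU has dropped its CMCI_EN bits in cmci_clear().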
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000000..23ee9e730f78
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
1/*
2 * Common corrected MCE threshold handler code:
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6
7#include <asm/irq_vectors.h>
8#include <asm/apic.h>
9#include <asm/idle.h>
10#include <asm/mce.h>
11
12static void default_threshold_interrupt(void)
13{
14 printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
15 THRESHOLD_APIC_VECTOR);
16}
17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19
20asmlinkage void mce_threshold_interrupt(void)
21{
22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector();
26 irq_exit();
27 /* Ack only at the end to avoid potential reentry */
28 ack_APIC_irq();
29}
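The new threshold.c keeps the APIC and irq bookkeeping in one place and dispatches through the mce_threshold_vector function pointer, which mce_amd_feature_init() and intel_init_cmci() repoint to their own handlers. A small user-space sketch of that indirection, with printf standing in for the real handlers and the common prologue/epilogue reduced to comments (the glue around the kernel names is made up for illustration):

#include <stdio.h>

static void default_threshold_handler(void)
{
    printf("unexpected threshold interrupt\n");
}

/* Vendor init code repoints this, exactly like mce_threshold_vector. */
static void (*threshold_vector)(void) = default_threshold_handler;

static void amd_handler(void)   { printf("AMD: poll threshold banks\n"); }
static void intel_handler(void) { printf("Intel: poll CMCI-owned banks\n"); }

static void threshold_interrupt(void)
{
    /* common prologue (exit_idle, irq_enter, irq stats) would go here */
    threshold_vector();
    /* common epilogue; ack only at the end to avoid potential reentry */
}

int main(void)
{
    threshold_interrupt();              /* default handler: unexpected */
    threshold_vector = intel_handler;   /* what intel_init_cmci() does */
    threshold_interrupt();
    threshold_vector = amd_handler;     /* what mce_amd_feature_init() does */
    threshold_interrupt();
    return 0;
}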
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b205272ad394..1736acc4d7aa 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
469 efi_memory_desc_t *md; 469 efi_memory_desc_t *md;
470 efi_status_t status; 470 efi_status_t status;
471 unsigned long size; 471 unsigned long size;
472 u64 end, systab, addr, npages; 472 u64 end, systab, addr, npages, end_pfn;
473 void *p, *va; 473 void *p, *va;
474 474
475 efi.systab = NULL; 475 efi.systab = NULL;
@@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
481 size = md->num_pages << EFI_PAGE_SHIFT; 481 size = md->num_pages << EFI_PAGE_SHIFT;
482 end = md->phys_addr + size; 482 end = md->phys_addr + size;
483 483
484 if (PFN_UP(end) <= max_low_pfn_mapped) 484 end_pfn = PFN_UP(end);
485 if (end_pfn <= max_low_pfn_mapped
486 || (end_pfn > (1UL << (32 - PAGE_SHIFT))
487 && end_pfn <= max_pfn_mapped))
485 va = __va(md->phys_addr); 488 va = __va(md->phys_addr);
486 else 489 else
487 va = efi_ioremap(md->phys_addr, size); 490 va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index a4ee29127fdf..22c3b7828c50 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void)
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
102{ 102{
103 static unsigned pages_mapped __initdata; 103 unsigned long last_map_pfn;
104 unsigned i, pages;
105 unsigned long offset;
106 104
107 pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); 105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
108 offset = phys_addr & ~PAGE_MASK; 106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
109 phys_addr &= PAGE_MASK;
110
111 if (pages_mapped + pages > MAX_EFI_IO_PAGES)
112 return NULL; 107 return NULL;
113 108
114 for (i = 0; i < pages; i++) { 109 return (void __iomem *)__va(phys_addr);
115 __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
116 phys_addr, PAGE_KERNEL);
117 phys_addr += PAGE_SIZE;
118 pages_mapped++;
119 }
120
121 return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
122 (pages_mapped - pages)) + offset;
123} 110}
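efi_ioremap() on 64-bit now just extends the direct mapping with init_memory_mapping() and fails when the returned last-mapped pfn does not cover the requested region, instead of consuming a fixed pool of fixmap slots. The coverage test is plain pfn arithmetic; a stand-alone sketch, where PAGE_SHIFT and the sample addresses are assumed values for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Returns 1 when the mapping reported by last_map_pfn covers [phys, phys+size). */
static int mapping_covers(unsigned long last_map_pfn,
                          unsigned long phys_addr, unsigned long size)
{
    return (last_map_pfn << PAGE_SHIFT) >= phys_addr + size;
}

int main(void)
{
    /* e.g. a 64 KiB EFI region at 0xfed00000 */
    unsigned long phys = 0xfed00000UL, size = 0x10000;

    printf("%d\n", mapping_covers(0xfed10, phys, size)); /* reaches the end -> 1 */
    printf("%d\n", mapping_covers(0xfed0f, phys, size)); /* one page short  -> 0 */
    return 0;
}

On success the region is reachable through the direct mapping, which is why the new code simply returns __va(phys_addr).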
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0a..f2f8540a7f3d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
136#ifdef CONFIG_X86_32 136#ifdef CONFIG_X86_32
137 if (!HAVE_HWFP) { 137 if (!HAVE_HWFP) {
138 memset(tsk->thread.xstate, 0, xstate_size); 138 memset(tsk->thread.xstate, 0, xstate_size);
139 finit(); 139 finit_task(tsk);
140 set_stopped_child_used_math(tsk); 140 set_stopped_child_used_math(tsk);
141 return 0; 141 return 0;
142 } 142 }
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1baf..e8192401da47 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
558 558
559static struct mpf_intel *mpf_found; 559static struct mpf_intel *mpf_found;
560 560
561static unsigned long __init get_mpc_size(unsigned long physptr)
562{
563 struct mpc_table *mpc;
564 unsigned long size;
565
566 mpc = early_ioremap(physptr, PAGE_SIZE);
567 size = mpc->length;
568 early_iounmap(mpc, PAGE_SIZE);
569 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
570
571 return size;
572}
573
561/* 574/*
562 * Scan the memory blocks for an SMP configuration block. 575 * Scan the memory blocks for an SMP configuration block.
563 */ 576 */
@@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early)
611 construct_default_ISA_mptable(mpf->feature1); 624 construct_default_ISA_mptable(mpf->feature1);
612 625
613 } else if (mpf->physptr) { 626 } else if (mpf->physptr) {
627 struct mpc_table *mpc;
628 unsigned long size;
614 629
630 size = get_mpc_size(mpf->physptr);
631 mpc = early_ioremap(mpf->physptr, size);
615 /* 632 /*
616 * Read the physical hardware table. Anything here will 633 * Read the physical hardware table. Anything here will
617 * override the defaults. 634 * override the defaults.
618 */ 635 */
619 if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { 636 if (!smp_read_mpc(mpc, early)) {
620#ifdef CONFIG_X86_LOCAL_APIC 637#ifdef CONFIG_X86_LOCAL_APIC
621 smp_found_config = 0; 638 smp_found_config = 0;
622#endif 639#endif
@@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
624 "BIOS bug, MP table errors detected!...\n"); 641 "BIOS bug, MP table errors detected!...\n");
625 printk(KERN_ERR "... disabling SMP support. " 642 printk(KERN_ERR "... disabling SMP support. "
626 "(tell your hw vendor)\n"); 643 "(tell your hw vendor)\n");
644 early_iounmap(mpc, size);
627 return; 645 return;
628 } 646 }
647 early_iounmap(mpc, size);
629 648
630 if (early) 649 if (early)
631 return; 650 return;
@@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
697 716
698 if (!reserve) 717 if (!reserve)
699 return 1; 718 return 1;
700 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 719 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
701 BOOTMEM_DEFAULT); 720 BOOTMEM_DEFAULT);
702 if (mpf->physptr) { 721 if (mpf->physptr) {
703 unsigned long size = PAGE_SIZE; 722 unsigned long size = get_mpc_size(mpf->physptr);
704#ifdef CONFIG_X86_32 723#ifdef CONFIG_X86_32
705 /* 724 /*
706 * We cannot access the MPC table to compute                          725
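get_mpc_size() solves the problem of not knowing the MP configuration table's length before mapping it: map a single page, read the length field from the header, unmap, and then map (and reserve) exactly that many bytes instead of assuming PAGE_SIZE. A hedged user-space sketch of the same two-stage read, with a plain buffer and map/unmap helpers standing in for early_ioremap()/early_iounmap(), and only the header fields the sketch needs:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct mpc_table {            /* only the fields this sketch uses */
    char signature[4];
    uint16_t length;
};

static unsigned char phys_mem[4096];   /* pretend physical memory */

/* Stand-ins for early_ioremap()/early_iounmap(). */
static void *map(unsigned long phys, unsigned long len) { (void)len; return phys_mem + phys; }
static void unmap(void *p, unsigned long len)           { (void)p; (void)len; }

static unsigned long get_mpc_size(unsigned long physptr)
{
    struct mpc_table *mpc = map(physptr, 4096);   /* first pass: one page */
    unsigned long size = mpc->length;
    unmap(mpc, 4096);
    return size;
}

int main(void)
{
    struct mpc_table t = { { 'P', 'C', 'M', 'P' }, 84 };
    memcpy(phys_mem + 0x100, &t, sizeof(t));

    unsigned long size = get_mpc_size(0x100);
    void *mpc = map(0x100, size);                 /* second pass: exact length */
    printf("MP table at 0x100, %lu bytes mapped at %p\n", size, mpc);
    unmap(mpc, size);
    return 0;
}

The same helper lets smp_scan_config() reserve the real table length rather than a fixed PAGE_SIZE.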
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bbb..2aef36d8aca2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), 216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
217 }, 217 },
218 }, 218 },
219 { /* Handle problems with rebooting on Dell XPS710 */
220 .callback = set_bios_reboot,
221 .ident = "Dell XPS710",
222 .matches = {
223 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
224 DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
225 },
226 },
219 { } 227 { }
220}; 228};
221 229
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4c54bc0d8ff3..f28c56e6bf94 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,7 +202,9 @@ struct ist_info ist_info;
202#endif 202#endif
203 203
204#else 204#else
205struct cpuinfo_x86 boot_cpu_data __read_mostly; 205struct cpuinfo_x86 boot_cpu_data __read_mostly = {
206 .x86_phys_bits = MAX_PHYSMEM_BITS,
207};
206EXPORT_SYMBOL(boot_cpu_data); 208EXPORT_SYMBOL(boot_cpu_data);
207#endif 209#endif
208 210
@@ -770,6 +772,9 @@ void __init setup_arch(char **cmdline_p)
770 772
771 finish_e820_parsing(); 773 finish_e820_parsing();
772 774
775 if (efi_enabled)
776 efi_init();
777
773 dmi_scan_machine(); 778 dmi_scan_machine();
774 779
775 dmi_check_system(bad_bios_dmi_table); 780 dmi_check_system(bad_bios_dmi_table);
@@ -789,8 +794,6 @@ void __init setup_arch(char **cmdline_p)
789 insert_resource(&iomem_resource, &data_resource); 794 insert_resource(&iomem_resource, &data_resource);
790 insert_resource(&iomem_resource, &bss_resource); 795 insert_resource(&iomem_resource, &bss_resource);
791 796
792 if (efi_enabled)
793 efi_init();
794 797
795#ifdef CONFIG_X86_32 798#ifdef CONFIG_X86_32
796 if (ppro_with_ram_bug()) { 799 if (ppro_with_ram_bug()) {
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce547..aa0987088774 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit(void) 33void finit_task(struct task_struct *tsk)
34{ 34{
35 control_word = 0x037f; 35 struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
36 partial_status = 0; 36 struct address *oaddr, *iaddr;
37 top = 0; /* We don't keep top in the status word internally. */ 37 soft->cwd = 0x037f;
38 fpu_tag_word = 0xffff; 38 soft->swd = 0;
39 soft->ftop = 0; /* We don't keep top in the status word internally. */
40 soft->twd = 0xffff;
39 /* The behaviour is different from that detailed in 41 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */ 42 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0; 43 oaddr = (struct address *)&soft->foo;
42 operand_address.selector = 0; 44 oaddr->offset = 0;
43 instruction_address.offset = 0; 45 oaddr->selector = 0;
44 instruction_address.selector = 0; 46 iaddr = (struct address *)&soft->fip;
45 instruction_address.opcode = 0; 47 iaddr->offset = 0;
46 no_ip_update = 1; 48 iaddr->selector = 0;
49 iaddr->opcode = 0;
50 soft->no_update = 1;
51}
52
53void finit(void)
54{
55 finit_task(current);
47} 56}
48 57
49/* 58/*
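The math-emu change turns finit(), which reset the soft-FPU state of the current task through a pile of globals, into finit_task(tsk) operating on an explicit task, keeping finit() as a thin wrapper so existing callers are unchanged while init_fpu() can reset another task's state directly. A simplified stand-alone sketch of that refactoring pattern (the struct and field names are stand-ins, not the kernel's i387_soft_struct):

#include <stdio.h>

struct soft_fpu {
    unsigned int cwd, swd, twd, ftop;
};

struct task {
    struct soft_fpu soft;
};

static struct task *current_task;   /* stands in for the kernel's "current" */

/* New form: operates on an explicit task instead of implicit globals. */
static void finit_task(struct task *tsk)
{
    struct soft_fpu *soft = &tsk->soft;

    soft->cwd  = 0x037f;
    soft->swd  = 0;
    soft->ftop = 0;
    soft->twd  = 0xffff;
}

/* Old entry point kept as a wrapper so existing callers still work. */
static void finit(void)
{
    finit_task(current_task);
}

int main(void)
{
    struct task a = { { 0 } }, b = { { 0 } };

    current_task = &a;
    finit();           /* resets a via the wrapper */
    finit_task(&b);    /* resets b directly, as init_fpu(tsk) now can */
    printf("a.cwd=%#x b.cwd=%#x\n", a.soft.cwd, b.soft.cwd);
    return 0;
}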
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0e..d11745334a67 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap);
158EXPORT_SYMBOL(kmap_atomic); 158EXPORT_SYMBOL(kmap_atomic);
159EXPORT_SYMBOL(kunmap_atomic); 159EXPORT_SYMBOL(kunmap_atomic);
160 160
161#ifdef CONFIG_NUMA
162void __init set_highmem_pages_init(void) 161void __init set_highmem_pages_init(void)
163{ 162{
164 struct zone *zone; 163 struct zone *zone;
@@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void)
182 } 181 }
183 totalram_pages += totalhigh_pages; 182 totalram_pages += totalhigh_pages;
184} 183}
185#else
186void __init set_highmem_pages_init(void)
187{
188 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
189
190 totalram_pages += totalhigh_pages;
191}
192#endif /* CONFIG_NUMA */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722587d8..6d63e3d1253d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,8 +1,345 @@
1#include <linux/ioport.h>
1#include <linux/swap.h> 2#include <linux/swap.h>
3
2#include <asm/cacheflush.h> 4#include <asm/cacheflush.h>
5#include <asm/e820.h>
6#include <asm/init.h>
3#include <asm/page.h> 7#include <asm/page.h>
8#include <asm/page_types.h>
4#include <asm/sections.h> 9#include <asm/sections.h>
5#include <asm/system.h> 10#include <asm/system.h>
11#include <asm/tlbflush.h>
12
13unsigned long __initdata e820_table_start;
14unsigned long __meminitdata e820_table_end;
15unsigned long __meminitdata e820_table_top;
16
17int after_bootmem;
18
19int direct_gbpages
20#ifdef CONFIG_DIRECT_GBPAGES
21 = 1
22#endif
23;
24
25static void __init find_early_table_space(unsigned long end, int use_pse,
26 int use_gbpages)
27{
28 unsigned long puds, pmds, ptes, tables, start;
29
30 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
31 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
32
33 if (use_gbpages) {
34 unsigned long extra;
35
36 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
37 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
38 } else
39 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
40
41 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
42
43 if (use_pse) {
44 unsigned long extra;
45
46 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
47#ifdef CONFIG_X86_32
48 extra += PMD_SIZE;
49#endif
50 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
51 } else
52 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
53
54 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
55
56#ifdef CONFIG_X86_32
57 /* for fixmap */
58 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
59#endif
60
61 /*
62 * RED-PEN putting page tables only on node 0 could
63 * cause a hotspot and fill up ZONE_DMA. The page tables
64 * need roughly 0.5KB per GB.
65 */
66#ifdef CONFIG_X86_32
67 start = 0x7000;
68 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
69 tables, PAGE_SIZE);
70#else /* CONFIG_X86_64 */
71 start = 0x8000;
72 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
73#endif
74 if (e820_table_start == -1UL)
75 panic("Cannot find space for the kernel page tables");
76
77 e820_table_start >>= PAGE_SHIFT;
78 e820_table_end = e820_table_start;
79 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
80
81 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
82 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
83}
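find_early_table_space() above estimates how much memory the early page tables will need: one pud entry per PUD_SIZE of the range, one pmd per PMD_SIZE not covered by gbpages, one pte per page not covered by PSE, with each level rounded up to whole pages. A stand-alone sketch of that arithmetic for a 1 GB direct mapping, assuming x86_64 shift values and 8-byte table entries; it mirrors the sizing logic only, not the e820 search:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PUD_SIZE   (1UL << PUD_SHIFT)

static unsigned long roundup_page(unsigned long x)
{
    return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

/* Rough copy of the sizing arithmetic in find_early_table_space(). */
static unsigned long table_space(unsigned long end, int use_pse, int use_gbpages)
{
    unsigned long puds, pmds, ptes, tables;

    puds   = (end + PUD_SIZE - 1) >> PUD_SHIFT;
    tables = roundup_page(puds * 8);          /* assume 8-byte pud entries */

    if (use_gbpages)
        pmds = ((end - ((end >> PUD_SHIFT) << PUD_SHIFT)) + PMD_SIZE - 1) >> PMD_SHIFT;
    else
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
    tables += roundup_page(pmds * 8);

    if (use_pse)
        ptes = ((end - ((end >> PMD_SHIFT) << PMD_SHIFT)) + PAGE_SIZE - 1) >> PAGE_SHIFT;
    else
        ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
    tables += roundup_page(ptes * 8);

    return tables;
}

int main(void)
{
    unsigned long end = 1UL << 30;   /* map the first 1 GB */

    printf("4k only : %lu KB\n", table_space(end, 0, 0) >> 10);
    printf("2M pages: %lu KB\n", table_space(end, 1, 0) >> 10);
    return 0;
}

Running it shows why PSE matters for the reservation: mapping 1 GB with 4k pages needs roughly 2 MB of page tables, while 2 MB pages get by with a few KB.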
84
85struct map_range {
86 unsigned long start;
87 unsigned long end;
88 unsigned page_size_mask;
89};
90
91#ifdef CONFIG_X86_32
92#define NR_RANGE_MR 3
93#else /* CONFIG_X86_64 */
94#define NR_RANGE_MR 5
95#endif
96
97static int save_mr(struct map_range *mr, int nr_range,
98 unsigned long start_pfn, unsigned long end_pfn,
99 unsigned long page_size_mask)
100{
101 if (start_pfn < end_pfn) {
102 if (nr_range >= NR_RANGE_MR)
103 panic("run out of range for init_memory_mapping\n");
104 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
105 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
106 mr[nr_range].page_size_mask = page_size_mask;
107 nr_range++;
108 }
109
110 return nr_range;
111}
112
113#ifdef CONFIG_X86_64
114static void __init init_gbpages(void)
115{
116 if (direct_gbpages && cpu_has_gbpages)
117 printk(KERN_INFO "Using GB pages for direct mapping\n");
118 else
119 direct_gbpages = 0;
120}
121#else
122static inline void init_gbpages(void)
123{
124}
125#endif
126
127/*
128 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
129 * This runs before bootmem is initialized and gets pages directly from
130 * the physical memory. To access them they are temporarily mapped.
131 */
132unsigned long __init_refok init_memory_mapping(unsigned long start,
133 unsigned long end)
134{
135 unsigned long page_size_mask = 0;
136 unsigned long start_pfn, end_pfn;
137 unsigned long pos;
138 unsigned long ret;
139
140 struct map_range mr[NR_RANGE_MR];
141 int nr_range, i;
142 int use_pse, use_gbpages;
143
144 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
145
146 if (!after_bootmem)
147 init_gbpages();
148
149#ifdef CONFIG_DEBUG_PAGEALLOC
150 /*
151 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
152 * This will simplify cpa(), which otherwise needs to support splitting
153 * large pages into small in interrupt context, etc.
154 */
155 use_pse = use_gbpages = 0;
156#else
157 use_pse = cpu_has_pse;
158 use_gbpages = direct_gbpages;
159#endif
160
161#ifdef CONFIG_X86_32
162#ifdef CONFIG_X86_PAE
163 set_nx();
164 if (nx_enabled)
165 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
166#endif
167
168 /* Enable PSE if available */
169 if (cpu_has_pse)
170 set_in_cr4(X86_CR4_PSE);
171
172 /* Enable PGE if available */
173 if (cpu_has_pge) {
174 set_in_cr4(X86_CR4_PGE);
175 __supported_pte_mask |= _PAGE_GLOBAL;
176 }
177#endif
178
179 if (use_gbpages)
180 page_size_mask |= 1 << PG_LEVEL_1G;
181 if (use_pse)
182 page_size_mask |= 1 << PG_LEVEL_2M;
183
184 memset(mr, 0, sizeof(mr));
185 nr_range = 0;
186
187 /* head if not big page alignment ? */
188 start_pfn = start >> PAGE_SHIFT;
189 pos = start_pfn << PAGE_SHIFT;
190#ifdef CONFIG_X86_32
191 /*
192 * Don't use a large page for the first 2/4MB of memory
193 * because there are often fixed size MTRRs in there
194 * and overlapping MTRRs into large pages can cause
195 * slowdowns.
196 */
197 if (pos == 0)
198 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
199 else
200 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
201 << (PMD_SHIFT - PAGE_SHIFT);
202#else /* CONFIG_X86_64 */
203 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
204 << (PMD_SHIFT - PAGE_SHIFT);
205#endif
206 if (end_pfn > (end >> PAGE_SHIFT))
207 end_pfn = end >> PAGE_SHIFT;
208 if (start_pfn < end_pfn) {
209 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
210 pos = end_pfn << PAGE_SHIFT;
211 }
212
213 /* big page (2M) range */
214 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
215 << (PMD_SHIFT - PAGE_SHIFT);
216#ifdef CONFIG_X86_32
217 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
218#else /* CONFIG_X86_64 */
219 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
220 << (PUD_SHIFT - PAGE_SHIFT);
221 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
222 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
223#endif
224
225 if (start_pfn < end_pfn) {
226 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
227 page_size_mask & (1<<PG_LEVEL_2M));
228 pos = end_pfn << PAGE_SHIFT;
229 }
230
231#ifdef CONFIG_X86_64
232 /* big page (1G) range */
233 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
234 << (PUD_SHIFT - PAGE_SHIFT);
235 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
236 if (start_pfn < end_pfn) {
237 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
238 page_size_mask &
239 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
240 pos = end_pfn << PAGE_SHIFT;
241 }
242
243 /* tail is not big page (1G) alignment */
244 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
245 << (PMD_SHIFT - PAGE_SHIFT);
246 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
247 if (start_pfn < end_pfn) {
248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
249 page_size_mask & (1<<PG_LEVEL_2M));
250 pos = end_pfn << PAGE_SHIFT;
251 }
252#endif
253
254 /* tail is not big page (2M) alignment */
255 start_pfn = pos>>PAGE_SHIFT;
256 end_pfn = end>>PAGE_SHIFT;
257 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
258
259 /* try to merge same page size and continuous */
260 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
261 unsigned long old_start;
262 if (mr[i].end != mr[i+1].start ||
263 mr[i].page_size_mask != mr[i+1].page_size_mask)
264 continue;
265 /* move it */
266 old_start = mr[i].start;
267 memmove(&mr[i], &mr[i+1],
268 (nr_range - 1 - i) * sizeof(struct map_range));
269 mr[i--].start = old_start;
270 nr_range--;
271 }
272
273 for (i = 0; i < nr_range; i++)
274 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
275 mr[i].start, mr[i].end,
276 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
277 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
278
279 /*
280 * Find space for the kernel direct mapping tables.
281 *
282 * Later we should allocate these tables in the local node of the
283 * memory mapped. Unfortunately this is done currently before the
284 * nodes are discovered.
285 */
286 if (!after_bootmem)
287 find_early_table_space(end, use_pse, use_gbpages);
288
289#ifdef CONFIG_X86_32
290 for (i = 0; i < nr_range; i++)
291 kernel_physical_mapping_init(mr[i].start, mr[i].end,
292 mr[i].page_size_mask);
293 ret = end;
294#else /* CONFIG_X86_64 */
295 for (i = 0; i < nr_range; i++)
296 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
297 mr[i].page_size_mask);
298#endif
299
300#ifdef CONFIG_X86_32
301 early_ioremap_page_table_range_init();
302
303 load_cr3(swapper_pg_dir);
304#endif
305
306#ifdef CONFIG_X86_64
307 if (!after_bootmem)
308 mmu_cr4_features = read_cr4();
309#endif
310 __flush_tlb_all();
311
312 if (!after_bootmem && e820_table_end > e820_table_start)
313 reserve_early(e820_table_start << PAGE_SHIFT,
314 e820_table_end << PAGE_SHIFT, "PGTABLE");
315
316 if (!after_bootmem)
317 early_memtest(start, end);
318
319 return ret >> PAGE_SHIFT;
320}
321
322
323/*
324 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
325 * is valid. The argument is a physical page number.
326 *
327 *
328 * On x86, access has to be given to the first megabyte of ram because that area
329 * contains bios code and data regions used by X and dosemu and similar apps.
330 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
331 * mmio resources as well as potential bios/acpi data regions.
332 */
333int devmem_is_allowed(unsigned long pagenr)
334{
335 if (pagenr <= 256)
336 return 1;
337 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
338 return 0;
339 if (!page_is_ram(pagenr))
340 return 1;
341 return 0;
342}
6 343
7void free_init_pages(char *what, unsigned long begin, unsigned long end) 344void free_init_pages(char *what, unsigned long begin, unsigned long end)
8{ 345{
@@ -47,3 +384,10 @@ void free_initmem(void)
47 (unsigned long)(&__init_begin), 384 (unsigned long)(&__init_begin),
48 (unsigned long)(&__init_end)); 385 (unsigned long)(&__init_end));
49} 386}
387
388#ifdef CONFIG_BLK_DEV_INITRD
389void free_initrd_mem(unsigned long start, unsigned long end)
390{
391 free_init_pages("initrd memory", start, end);
392}
393#endif
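Most of the unified init.c is init_memory_mapping() splitting [start, end) into a 4k head up to the first 2M boundary, large-page middles, and 4k tails, then mapping each range with the biggest page size allowed by page_size_mask. A stand-alone sketch of the splitting step for a single example range, assuming x86_64 shift values and 2M pages only; the 1G step and the merge pass are omitted for brevity:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21

struct range { unsigned long start, end; int big; };

/* Split [start, end) into a 4k head, a 2M-aligned middle and a 4k tail. */
static int split(unsigned long start, unsigned long end, struct range *r)
{
    int n = 0;
    unsigned long pos = start;
    unsigned long head_end = ((start + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT) << PMD_SHIFT;
    unsigned long big_end  = (end >> PMD_SHIFT) << PMD_SHIFT;

    if (head_end > end)
        head_end = end;
    if (pos < head_end) {
        r[n++] = (struct range){ pos, head_end, 0 };
        pos = head_end;
    }
    if (pos < big_end) {
        r[n++] = (struct range){ pos, big_end, 1 };
        pos = big_end;
    }
    if (pos < end)
        r[n++] = (struct range){ pos, end, 0 };
    return n;
}

int main(void)
{
    struct range r[3];
    /* e.g. map 0x00100000 .. 0x7ff00000 (1 MB up to just under 2 GB) */
    int n = split(0x00100000UL, 0x7ff00000UL, r);

    for (int i = 0; i < n; i++)
        printf("%010lx - %010lx page %s\n", r[i].start, r[i].end,
               r[i].big ? "2M" : "4k");
    return 0;
}

The kernel version additionally merges adjacent ranges that ended up with the same page_size_mask and, on 32-bit, keeps the first 2/4 MB on small pages because fixed-size MTRRs often live there.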
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 47df0e1bbeb9..2966c6b8d304 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/init.h>
52 53
53unsigned long max_low_pfn_mapped; 54unsigned long max_low_pfn_mapped;
54unsigned long max_pfn_mapped; 55unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
58 59
59static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
60 61
61 62bool __read_mostly __vmalloc_start_set = false;
62static unsigned long __initdata table_start;
63static unsigned long __meminitdata table_end;
64static unsigned long __meminitdata table_top;
65
66static int __initdata after_init_bootmem;
67 63
68static __init void *alloc_low_page(void) 64static __init void *alloc_low_page(void)
69{ 65{
70 unsigned long pfn = table_end++; 66 unsigned long pfn = e820_table_end++;
71 void *adr; 67 void *adr;
72 68
73 if (pfn >= table_top) 69 if (pfn >= e820_table_top)
74 panic("alloc_low_page: ran out of memory"); 70 panic("alloc_low_page: ran out of memory");
75 71
76 adr = __va(pfn * PAGE_SIZE); 72 adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
90 86
91#ifdef CONFIG_X86_PAE 87#ifdef CONFIG_X86_PAE
92 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 88 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
93 if (after_init_bootmem) 89 if (after_bootmem)
94 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 90 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
95 else 91 else
96 pmd_table = (pmd_t *)alloc_low_page(); 92 pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
117 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 113 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
118 pte_t *page_table = NULL; 114 pte_t *page_table = NULL;
119 115
120 if (after_init_bootmem) { 116 if (after_bootmem) {
121#ifdef CONFIG_DEBUG_PAGEALLOC 117#ifdef CONFIG_DEBUG_PAGEALLOC
122 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
123#endif 119#endif
@@ -168,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
168 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 164 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
169 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 165 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
170 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 166 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
171 && ((__pa(pte) >> PAGE_SHIFT) < table_start 167 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
172 || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { 168 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
173 pte_t *newpte; 169 pte_t *newpte;
174 int i; 170 int i;
175 171
176 BUG_ON(after_init_bootmem); 172 BUG_ON(after_bootmem);
177 newpte = alloc_low_page(); 173 newpte = alloc_low_page();
178 for (i = 0; i < PTRS_PER_PTE; i++) 174 for (i = 0; i < PTRS_PER_PTE; i++)
179 set_pte(newpte + i, pte[i]); 175 set_pte(newpte + i, pte[i]);
@@ -242,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
242 * of max_low_pfn pages, by creating page tables starting from address 238 * of max_low_pfn pages, by creating page tables starting from address
243 * PAGE_OFFSET: 239 * PAGE_OFFSET:
244 */ 240 */
245static void __init kernel_physical_mapping_init(pgd_t *pgd_base, 241unsigned long __init
246 unsigned long start_pfn, 242kernel_physical_mapping_init(unsigned long start,
247 unsigned long end_pfn, 243 unsigned long end,
248 int use_pse) 244 unsigned long page_size_mask)
249{ 245{
246 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
247 unsigned long start_pfn, end_pfn;
248 pgd_t *pgd_base = swapper_pg_dir;
250 int pgd_idx, pmd_idx, pte_ofs; 249 int pgd_idx, pmd_idx, pte_ofs;
251 unsigned long pfn; 250 unsigned long pfn;
252 pgd_t *pgd; 251 pgd_t *pgd;
@@ -255,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
255 unsigned pages_2m, pages_4k; 254 unsigned pages_2m, pages_4k;
256 int mapping_iter; 255 int mapping_iter;
257 256
257 start_pfn = start >> PAGE_SHIFT;
258 end_pfn = end >> PAGE_SHIFT;
259
258 /* 260 /*
259 * First iteration will setup identity mapping using large/small pages 261 * First iteration will setup identity mapping using large/small pages
260 * based on use_pse, with other attributes same as set by 262 * based on use_pse, with other attributes same as set by
@@ -369,26 +371,6 @@ repeat:
369 mapping_iter = 2; 371 mapping_iter = 2;
370 goto repeat; 372 goto repeat;
371 } 373 }
372}
373
374/*
375 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
376 * is valid. The argument is a physical page number.
377 *
378 *
379 * On x86, access has to be given to the first megabyte of ram because that area
380 * contains bios code and data regions used by X and dosemu and similar apps.
381 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
382 * mmio resources as well as potential bios/acpi data regions.
383 */
384int devmem_is_allowed(unsigned long pagenr)
385{
386 if (pagenr <= 256)
387 return 1;
388 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
389 return 0;
390 if (!page_is_ram(pagenr))
391 return 1;
392 return 0; 374 return 0;
393} 375}
394 376
@@ -545,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
545 * be partially populated, and so it avoids stomping on any existing 527 * be partially populated, and so it avoids stomping on any existing
546 * mappings. 528 * mappings.
547 */ 529 */
548static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) 530void __init early_ioremap_page_table_range_init(void)
549{ 531{
532 pgd_t *pgd_base = swapper_pg_dir;
550 unsigned long vaddr, end; 533 unsigned long vaddr, end;
551 534
552 /* 535 /*
@@ -641,7 +624,7 @@ static int __init noexec_setup(char *str)
641} 624}
642early_param("noexec", noexec_setup); 625early_param("noexec", noexec_setup);
643 626
644static void __init set_nx(void) 627void __init set_nx(void)
645{ 628{
646 unsigned int v[4], l, h; 629 unsigned int v[4], l, h;
647 630
@@ -793,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
793#ifdef CONFIG_FLATMEM 776#ifdef CONFIG_FLATMEM
794 max_mapnr = num_physpages; 777 max_mapnr = num_physpages;
795#endif 778#endif
779 __vmalloc_start_set = true;
780
796 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 781 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
797 pages_to_mb(max_low_pfn)); 782 pages_to_mb(max_low_pfn));
798 783
@@ -814,176 +799,61 @@ static void __init zone_sizes_init(void)
814 free_area_init_nodes(max_zone_pfns); 799 free_area_init_nodes(max_zone_pfns);
815} 800}
816 801
802static unsigned long __init setup_node_bootmem(int nodeid,
803 unsigned long start_pfn,
804 unsigned long end_pfn,
805 unsigned long bootmap)
806{
807 unsigned long bootmap_size;
808
809 if (start_pfn > max_low_pfn)
810 return bootmap;
811 if (end_pfn > max_low_pfn)
812 end_pfn = max_low_pfn;
813
814 /* don't touch min_low_pfn */
815 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
816 bootmap >> PAGE_SHIFT,
817 start_pfn, end_pfn);
818 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
819 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
820 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
821 nodeid, bootmap, bootmap + bootmap_size);
822 free_bootmem_with_active_regions(nodeid, end_pfn);
823 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
824
825 return bootmap + bootmap_size;
826}
827
817void __init setup_bootmem_allocator(void) 828void __init setup_bootmem_allocator(void)
818{ 829{
819 int i; 830 int nodeid;
820 unsigned long bootmap_size, bootmap; 831 unsigned long bootmap_size, bootmap;
821 /* 832 /*
822 * Initialize the boot-time allocator (with low memory only): 833 * Initialize the boot-time allocator (with low memory only):
823 */ 834 */
824 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; 835 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
825 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, 836 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
826 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
827 PAGE_SIZE); 837 PAGE_SIZE);
828 if (bootmap == -1L) 838 if (bootmap == -1L)
829 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 839 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
830 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 840 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
831 841
832 /* don't touch min_low_pfn */
833 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
834 min_low_pfn, max_low_pfn);
835 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 842 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
836 max_pfn_mapped<<PAGE_SHIFT); 843 max_pfn_mapped<<PAGE_SHIFT);
837 printk(KERN_INFO " low ram: %08lx - %08lx\n", 844 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
838 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
839 printk(KERN_INFO " bootmap %08lx - %08lx\n",
840 bootmap, bootmap + bootmap_size);
841 for_each_online_node(i)
842 free_bootmem_with_active_regions(i, max_low_pfn);
843 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
844
845 after_init_bootmem = 1;
846}
847
848static void __init find_early_table_space(unsigned long end, int use_pse)
849{
850 unsigned long puds, pmds, ptes, tables, start;
851
852 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
853 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
854
855 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
856 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
857
858 if (use_pse) {
859 unsigned long extra;
860
861 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
862 extra += PMD_SIZE;
863 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
864 } else
865 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
866 845
867 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); 846#ifdef CONFIG_NEED_MULTIPLE_NODES
868 847 for_each_online_node(nodeid)
869 /* for fixmap */ 848 bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
870 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 849 node_end_pfn[nodeid], bootmap);
871
872 /*
873 * RED-PEN putting page tables only on node 0 could
874 * cause a hotspot and fill up ZONE_DMA. The page tables
875 * need roughly 0.5KB per GB.
876 */
877 start = 0x7000;
878 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
879 tables, PAGE_SIZE);
880 if (table_start == -1UL)
881 panic("Cannot find space for the kernel page tables");
882
883 table_start >>= PAGE_SHIFT;
884 table_end = table_start;
885 table_top = table_start + (tables>>PAGE_SHIFT);
886
887 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
888 end, table_start << PAGE_SHIFT,
889 (table_start << PAGE_SHIFT) + tables);
890}
891
892unsigned long __init_refok init_memory_mapping(unsigned long start,
893 unsigned long end)
894{
895 pgd_t *pgd_base = swapper_pg_dir;
896 unsigned long start_pfn, end_pfn;
897 unsigned long big_page_start;
898#ifdef CONFIG_DEBUG_PAGEALLOC
899 /*
900 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
901 * This will simplify cpa(), which otherwise needs to support splitting
902 * large pages into small in interrupt context, etc.
903 */
904 int use_pse = 0;
905#else 850#else
906 int use_pse = cpu_has_pse; 851 bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
907#endif
908
909 /*
910 * Find space for the kernel direct mapping tables.
911 */
912 if (!after_init_bootmem)
913 find_early_table_space(end, use_pse);
914
915#ifdef CONFIG_X86_PAE
916 set_nx();
917 if (nx_enabled)
918 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
919#endif 852#endif
920 853
921 /* Enable PSE if available */ 854 after_bootmem = 1;
922 if (cpu_has_pse)
923 set_in_cr4(X86_CR4_PSE);
924
925 /* Enable PGE if available */
926 if (cpu_has_pge) {
927 set_in_cr4(X86_CR4_PGE);
928 __supported_pte_mask |= _PAGE_GLOBAL;
929 }
930
931 /*
932 * Don't use a large page for the first 2/4MB of memory
933 * because there are often fixed size MTRRs in there
934 * and overlapping MTRRs into large pages can cause
935 * slowdowns.
936 */
937 big_page_start = PMD_SIZE;
938
939 if (start < big_page_start) {
940 start_pfn = start >> PAGE_SHIFT;
941 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
942 } else {
943 /* head is not big page alignment ? */
944 start_pfn = start >> PAGE_SHIFT;
945 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
946 << (PMD_SHIFT - PAGE_SHIFT);
947 }
948 if (start_pfn < end_pfn)
949 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
950
951 /* big page range */
952 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
953 << (PMD_SHIFT - PAGE_SHIFT);
954 if (start_pfn < (big_page_start >> PAGE_SHIFT))
955 start_pfn = big_page_start >> PAGE_SHIFT;
956 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
957 if (start_pfn < end_pfn)
958 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
959 use_pse);
960
961 /* tail is not big page alignment ? */
962 start_pfn = end_pfn;
963 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
964 end_pfn = end >> PAGE_SHIFT;
965 if (start_pfn < end_pfn)
966 kernel_physical_mapping_init(pgd_base, start_pfn,
967 end_pfn, 0);
968 }
969
970 early_ioremap_page_table_range_init(pgd_base);
971
972 load_cr3(swapper_pg_dir);
973
974 __flush_tlb_all();
975
976 if (!after_init_bootmem)
977 reserve_early(table_start << PAGE_SHIFT,
978 table_end << PAGE_SHIFT, "PGTABLE");
979
980 if (!after_init_bootmem)
981 early_memtest(start, end);
982
983 return end >> PAGE_SHIFT;
984} 855}
985 856
986
987/* 857/*
988 * paging_init() sets up the page tables - note that the first 8MB are 858 * paging_init() sets up the page tables - note that the first 8MB are
989 * already mapped by head.S. 859 * already mapped by head.S.
@@ -1217,13 +1087,6 @@ void mark_rodata_ro(void)
1217} 1087}
1218#endif 1088#endif
1219 1089
1220#ifdef CONFIG_BLK_DEV_INITRD
1221void free_initrd_mem(unsigned long start, unsigned long end)
1222{
1223 free_init_pages("initrd memory", start, end);
1224}
1225#endif
1226
1227int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 1090int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1228 int flags) 1091 int flags)
1229{ 1092{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 07f44d491df1..8a853bc3b287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
48#include <asm/kdebug.h> 48#include <asm/kdebug.h>
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h>
51 52
52/* 53/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
61 62
62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
63 64
64int direct_gbpages
65#ifdef CONFIG_DIRECT_GBPAGES
66 = 1
67#endif
68;
69
70static int __init parse_direct_gbpages_off(char *arg) 65static int __init parse_direct_gbpages_off(char *arg)
71{ 66{
72 direct_gbpages = 0; 67 direct_gbpages = 0;
@@ -87,8 +82,6 @@ early_param("gbpages", parse_direct_gbpages_on);
87 * around without checking the pgd every time. 82 * around without checking the pgd every time.
88 */ 83 */
89 84
90int after_bootmem;
91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask); 86EXPORT_SYMBOL_GPL(__supported_pte_mask);
94 87
@@ -325,13 +318,9 @@ void __init cleanup_highmap(void)
325 } 318 }
326} 319}
327 320
328static unsigned long __initdata table_start;
329static unsigned long __meminitdata table_end;
330static unsigned long __meminitdata table_top;
331
332static __ref void *alloc_low_page(unsigned long *phys) 321static __ref void *alloc_low_page(unsigned long *phys)
333{ 322{
334 unsigned long pfn = table_end++; 323 unsigned long pfn = e820_table_end++;
335 void *adr; 324 void *adr;
336 325
337 if (after_bootmem) { 326 if (after_bootmem) {
@@ -341,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
341 return adr; 330 return adr;
342 } 331 }
343 332
344 if (pfn >= table_top) 333 if (pfn >= e820_table_top)
345 panic("alloc_low_page: ran out of memory"); 334 panic("alloc_low_page: ran out of memory");
346 335
347 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 336 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -581,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
581 return phys_pud_init(pud, addr, end, page_size_mask); 570 return phys_pud_init(pud, addr, end, page_size_mask);
582} 571}
583 572
584static void __init find_early_table_space(unsigned long end, int use_pse, 573unsigned long __init
585 int use_gbpages) 574kernel_physical_mapping_init(unsigned long start,
586{ 575 unsigned long end,
587 unsigned long puds, pmds, ptes, tables, start; 576 unsigned long page_size_mask)
588
589 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
590 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
591 if (use_gbpages) {
592 unsigned long extra;
593 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
594 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
595 } else
596 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
597 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
598
599 if (use_pse) {
600 unsigned long extra;
601 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
602 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
603 } else
604 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
605 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
606
607 /*
608 * RED-PEN putting page tables only on node 0 could
609 * cause a hotspot and fill up ZONE_DMA. The page tables
610 * need roughly 0.5KB per GB.
611 */
612 start = 0x8000;
613 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
614 if (table_start == -1UL)
615 panic("Cannot find space for the kernel page tables");
616
617 table_start >>= PAGE_SHIFT;
618 table_end = table_start;
619 table_top = table_start + (tables >> PAGE_SHIFT);
620
621 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
622 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
623}
624
625static void __init init_gbpages(void)
626{
627 if (direct_gbpages && cpu_has_gbpages)
628 printk(KERN_INFO "Using GB pages for direct mapping\n");
629 else
630 direct_gbpages = 0;
631}
632
633static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
634 unsigned long end,
635 unsigned long page_size_mask)
636{ 577{
637 578
638 unsigned long next, last_map_addr = end; 579 unsigned long next, last_map_addr = end;
@@ -669,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
669 return last_map_addr; 610 return last_map_addr;
670} 611}
671 612
672struct map_range {
673 unsigned long start;
674 unsigned long end;
675 unsigned page_size_mask;
676};
677
678#define NR_RANGE_MR 5
679
680static int save_mr(struct map_range *mr, int nr_range,
681 unsigned long start_pfn, unsigned long end_pfn,
682 unsigned long page_size_mask)
683{
684
685 if (start_pfn < end_pfn) {
686 if (nr_range >= NR_RANGE_MR)
687 panic("run out of range for init_memory_mapping\n");
688 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
689 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
690 mr[nr_range].page_size_mask = page_size_mask;
691 nr_range++;
692 }
693
694 return nr_range;
695}
696
697/*
698 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
699 * This runs before bootmem is initialized and gets pages directly from
700 * the physical memory. To access them they are temporarily mapped.
701 */
702unsigned long __init_refok init_memory_mapping(unsigned long start,
703 unsigned long end)
704{
705 unsigned long last_map_addr = 0;
706 unsigned long page_size_mask = 0;
707 unsigned long start_pfn, end_pfn;
708 unsigned long pos;
709
710 struct map_range mr[NR_RANGE_MR];
711 int nr_range, i;
712 int use_pse, use_gbpages;
713
714 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
715
716 /*
717 * Find space for the kernel direct mapping tables.
718 *
719 * Later we should allocate these tables in the local node of the
720 * memory mapped. Unfortunately this is done currently before the
721 * nodes are discovered.
722 */
723 if (!after_bootmem)
724 init_gbpages();
725
726#ifdef CONFIG_DEBUG_PAGEALLOC
727 /*
728 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
729 * This will simplify cpa(), which otherwise needs to support splitting
730 * large pages into small in interrupt context, etc.
731 */
732 use_pse = use_gbpages = 0;
733#else
734 use_pse = cpu_has_pse;
735 use_gbpages = direct_gbpages;
736#endif
737
738 if (use_gbpages)
739 page_size_mask |= 1 << PG_LEVEL_1G;
740 if (use_pse)
741 page_size_mask |= 1 << PG_LEVEL_2M;
742
743 memset(mr, 0, sizeof(mr));
744 nr_range = 0;
745
746 /* head if not big page alignment ?*/
747 start_pfn = start >> PAGE_SHIFT;
748 pos = start_pfn << PAGE_SHIFT;
749 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
750 << (PMD_SHIFT - PAGE_SHIFT);
751 if (end_pfn > (end >> PAGE_SHIFT))
752 end_pfn = end >> PAGE_SHIFT;
753 if (start_pfn < end_pfn) {
754 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
755 pos = end_pfn << PAGE_SHIFT;
756 }
757
758 /* big page (2M) range*/
759 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
760 << (PMD_SHIFT - PAGE_SHIFT);
761 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
762 << (PUD_SHIFT - PAGE_SHIFT);
763 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
764 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
765 if (start_pfn < end_pfn) {
766 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
767 page_size_mask & (1<<PG_LEVEL_2M));
768 pos = end_pfn << PAGE_SHIFT;
769 }
770
771 /* big page (1G) range */
772 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
773 << (PUD_SHIFT - PAGE_SHIFT);
774 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
775 if (start_pfn < end_pfn) {
776 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
777 page_size_mask &
778 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
779 pos = end_pfn << PAGE_SHIFT;
780 }
781
782 /* tail is not big page (1G) alignment */
783 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
784 << (PMD_SHIFT - PAGE_SHIFT);
785 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
786 if (start_pfn < end_pfn) {
787 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
788 page_size_mask & (1<<PG_LEVEL_2M));
789 pos = end_pfn << PAGE_SHIFT;
790 }
791
792 /* tail is not big page (2M) alignment */
793 start_pfn = pos>>PAGE_SHIFT;
794 end_pfn = end>>PAGE_SHIFT;
795 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
796
797 /* try to merge same page size and continuous */
798 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
799 unsigned long old_start;
800 if (mr[i].end != mr[i+1].start ||
801 mr[i].page_size_mask != mr[i+1].page_size_mask)
802 continue;
803 /* move it */
804 old_start = mr[i].start;
805 memmove(&mr[i], &mr[i+1],
806 (nr_range - 1 - i) * sizeof (struct map_range));
807 mr[i--].start = old_start;
808 nr_range--;
809 }
810
811 for (i = 0; i < nr_range; i++)
812 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
813 mr[i].start, mr[i].end,
814 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
815 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
816
817 if (!after_bootmem)
818 find_early_table_space(end, use_pse, use_gbpages);
819
820 for (i = 0; i < nr_range; i++)
821 last_map_addr = kernel_physical_mapping_init(
822 mr[i].start, mr[i].end,
823 mr[i].page_size_mask);
824
825 if (!after_bootmem)
826 mmu_cr4_features = read_cr4();
827 __flush_tlb_all();
828
829 if (!after_bootmem && table_end > table_start)
830 reserve_early(table_start << PAGE_SHIFT,
831 table_end << PAGE_SHIFT, "PGTABLE");
832
833 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
834 last_map_addr, end);
835
836 if (!after_bootmem)
837 early_memtest(start, end);
838
839 return last_map_addr >> PAGE_SHIFT;
840}
841
842#ifndef CONFIG_NUMA 613#ifndef CONFIG_NUMA
843void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 614void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
844{ 615{
@@ -910,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
910 681
911#endif /* CONFIG_MEMORY_HOTPLUG */ 682#endif /* CONFIG_MEMORY_HOTPLUG */
912 683
913/*
914 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
915 * is valid. The argument is a physical page number.
916 *
917 *
918 * On x86, access has to be given to the first megabyte of ram because that area
919 * contains bios code and data regions used by X and dosemu and similar apps.
920 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
921 * mmio resources as well as potential bios/acpi data regions.
922 */
923int devmem_is_allowed(unsigned long pagenr)
924{
925 if (pagenr <= 256)
926 return 1;
927 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
928 return 0;
929 if (!page_is_ram(pagenr))
930 return 1;
931 return 0;
932}
933
934
935static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 684static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
936 kcore_modules, kcore_vsyscall; 685 kcore_modules, kcore_vsyscall;
937 686
@@ -1019,13 +768,6 @@ void mark_rodata_ro(void)
1019 768
1020#endif 769#endif
1021 770
1022#ifdef CONFIG_BLK_DEV_INITRD
1023void free_initrd_mem(unsigned long start, unsigned long end)
1024{
1025 free_init_pages("initrd memory", start, end);
1026}
1027#endif
1028
1029int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 771int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1030 int flags) 772 int flags)
1031{ 773{
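
The init_memory_mapping() body removed above (this series consolidates the 32-bit and 64-bit variants into arch/x86/mm/init.c, per the diffstat) splits the [start, end) range so each piece is mapped with the largest page size whose alignment it satisfies: a 4k head up to the first 2 MB boundary, 2 MB pages up to the first 1 GB boundary, 1 GB pages in the middle, then 2 MB and 4k tails; adjacent entries with the same page_size_mask are merged afterwards. A minimal user-space sketch of that splitting (helper names and example addresses are illustrative only, not the kernel code):

#include <stdio.h>

#define PMD_SIZE (1UL << 21)    /* 2 MB */
#define PUD_SIZE (1UL << 30)    /* 1 GB */

static unsigned long round_up_to(unsigned long x, unsigned long size)
{
        return (x + size - 1) & ~(size - 1);
}

static unsigned long round_down_to(unsigned long x, unsigned long size)
{
        return x & ~(size - 1);
}

static void emit(unsigned long s, unsigned long e, const char *page)
{
        if (s < e)      /* skip empty sub-ranges */
                printf(" %010lx - %010lx page %s\n", s, e, page);
}

int main(void)
{
        unsigned long start = 0x00100000UL;     /* 1 MB, example value */
        unsigned long end   = 0x80000000UL;     /* 2 GB, example value */
        unsigned long pos = start, next;

        /* 4k head up to the first 2 MB boundary */
        next = round_up_to(pos, PMD_SIZE);
        if (next > end)
                next = end;
        emit(pos, next, "4k");
        pos = next;

        /* 2 MB pages up to the first 1 GB boundary */
        next = round_up_to(pos, PUD_SIZE);
        if (next > round_down_to(end, PMD_SIZE))
                next = round_down_to(end, PMD_SIZE);
        if (next > pos) {
                emit(pos, next, "2M");
                pos = next;
        }

        /* 1 GB pages in the middle */
        next = round_down_to(end, PUD_SIZE);
        if (next > pos) {
                emit(pos, next, "1G");
                pos = next;
        }

        /* 2 MB tail below the last 1 GB boundary, then the 4k tail */
        next = round_down_to(end, PMD_SIZE);
        if (next > pos) {
                emit(pos, next, "2M");
                pos = next;
        }
        emit(pos, end, "4k");
        return 0;
}

For the example range this prints a 4k run from 1 MB to 2 MB, a 2M run up to 1 GB and a 1G run up to 2 GB, mirroring the KERN_DEBUG table the kernel prints above.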
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648a..62773abdf088 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x)
38 } else { 38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET; 40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : 41 VIRTUAL_BUG_ON(!phys_addr_valid(x));
42 !phys_addr_valid(x));
43 } 42 }
44 return x; 43 return x;
45} 44}
@@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x)
56 if (x < PAGE_OFFSET) 55 if (x < PAGE_OFFSET)
57 return false; 56 return false;
58 x -= PAGE_OFFSET; 57 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ? 58 if (!phys_addr_valid(x))
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false; 59 return false;
62 }
63 } 60 }
64 61
65 return pfn_valid(x >> PAGE_SHIFT); 62 return pfn_valid(x >> PAGE_SHIFT);
@@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr)
76#ifdef CONFIG_DEBUG_VIRTUAL 73#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x) 74unsigned long __phys_addr(unsigned long x)
78{ 75{
79 /* VMALLOC_* aren't constants; not available at the boot time */ 76 /* VMALLOC_* aren't constants */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 77 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && 78 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET; 79 return x - PAGE_OFFSET;
84} 80}
85EXPORT_SYMBOL(__phys_addr); 81EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +85,7 @@ bool __virt_addr_valid(unsigned long x)
89{ 85{
90 if (x < PAGE_OFFSET) 86 if (x < PAGE_OFFSET)
91 return false; 87 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) 88 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
93 return false; 89 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); 90 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95} 91}
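
On 64-bit the SYSTEM_BOOTING special case in __phys_addr()/__virt_addr_valid() is dropped, since phys_addr_valid() can be used throughout; on 32-bit (CONFIG_DEBUG_VIRTUAL) the is_vmalloc_addr() check is now gated on __vmalloc_start_set, a flag added elsewhere in this series (see pgtable_32_types.h in the diffstat) that is raised once VMALLOC_START has actually been computed, instead of on system_state, which only changes much later in boot. A hypothetical caller-side example of the public wrapper whose behaviour this affects (the helper name below is made up for illustration):

#include <linux/mm.h>
#include <asm/io.h>

/* Return the physical address of a direct-mapped buffer, or 0 if the
 * pointer is not a valid lowmem address (e.g. vmalloc() memory). */
static phys_addr_t lowmem_buf_phys(void *buf)
{
        if (!virt_addr_valid(buf))
                return 0;
        return virt_to_phys(buf);       /* only meaningful for the direct map */
}

With the patch this check behaves the same way whether it runs during early boot or afterwards, because it keys off whether VMALLOC_START is known rather than off system_state.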
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 93d82038af4b..9f205030d9aa 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,11 +32,14 @@ struct kmmio_fault_page {
32 struct list_head list; 32 struct list_head list;
33 struct kmmio_fault_page *release_next; 33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */ 34 unsigned long page; /* location of the fault page */
35 bool old_presence; /* page presence prior to arming */
36 bool armed;
35 37
36 /* 38 /*
37 * Number of times this page has been registered as a part 39 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed. 40 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU). 41 * Used only by writers (RCU) and post_kmmio_handler().
42 * Protected by kmmio_lock, when linked into kmmio_page_table.
40 */ 43 */
41 int count; 44 int count;
42}; 45};
@@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
105 return NULL; 108 return NULL;
106} 109}
107 110
108static void set_page_present(unsigned long addr, bool present, 111static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
109 unsigned int *pglevel) 112{
113 pmdval_t v = pmd_val(*pmd);
114 *old = !!(v & _PAGE_PRESENT);
115 v &= ~_PAGE_PRESENT;
116 if (present)
117 v |= _PAGE_PRESENT;
118 set_pmd(pmd, __pmd(v));
119}
120
121static void set_pte_presence(pte_t *pte, bool present, bool *old)
122{
123 pteval_t v = pte_val(*pte);
124 *old = !!(v & _PAGE_PRESENT);
125 v &= ~_PAGE_PRESENT;
126 if (present)
127 v |= _PAGE_PRESENT;
128 set_pte_atomic(pte, __pte(v));
129}
130
131static int set_page_presence(unsigned long addr, bool present, bool *old)
110{ 132{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level; 133 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level); 134 pte_t *pte = lookup_address(addr, &level);
116 135
117 if (!pte) { 136 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr); 137 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return; 138 return -1;
120 } 139 }
121 140
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) { 141 switch (level) {
126 case PG_LEVEL_2M: 142 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte; 143 set_pmd_presence((pmd_t *)pte, present, old);
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break; 144 break;
133
134 case PG_LEVEL_4K: 145 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT; 146 set_pte_presence(pte, present, old);
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break; 147 break;
140
141 default: 148 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level); 149 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return; 150 return -1;
144 } 151 }
145 152
146 __flush_tlb_one(addr); 153 __flush_tlb_one(addr);
154 return 0;
147} 155}
148 156
149/** Mark the given page as not present. Access to it will trigger a fault. */ 157/*
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) 158 * Mark the given page as not present. Access to it will trigger a fault.
159 *
160 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
161 * protection is ignored here. RCU read lock is assumed held, so the struct
162 * will not disappear unexpectedly. Furthermore, the caller must guarantee,
163 * that double arming the same virtual address (page) cannot occur.
164 *
165 * Double disarming on the other hand is allowed, and may occur when a fault
166 * and mmiotrace shutdown happen simultaneously.
167 */
168static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
151{ 169{
152 set_page_present(page & PAGE_MASK, false, pglevel); 170 int ret;
171 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
172 if (f->armed) {
173 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
174 f->page, f->count, f->old_presence);
175 }
176 ret = set_page_presence(f->page, false, &f->old_presence);
177 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
178 f->armed = true;
179 return ret;
153} 180}
154 181
155/** Mark the given page as present. */ 182/** Restore the given page to saved presence state. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) 183static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
157{ 184{
158 set_page_present(page & PAGE_MASK, true, pglevel); 185 bool tmp;
186 int ret = set_page_presence(f->page, f->old_presence, &tmp);
187 WARN_ONCE(ret < 0,
188 KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
189 f->armed = false;
159} 190}
160 191
161/* 192/*
@@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
202 233
203 ctx = &get_cpu_var(kmmio_ctx); 234 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) { 235 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) { 236 if (addr == ctx->addr) {
207 /* 237 /*
208 * On SMP we sometimes get recursive probe hits on the 238 * A second fault on the same page means some other
209 * same address. Context is already saved, fall out. 239 * condition needs handling by do_page_fault(), the
240 * page really not being present is the most common.
210 */ 241 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for " 242 pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
212 "address 0x%08lx.\n", 243 addr, smp_processor_id());
213 smp_processor_id(), addr); 244
214 ret = 1; 245 if (!faultpage->old_presence)
215 goto no_kmmio_ctx; 246 pr_info("kmmio: unexpected secondary hit for "
216 } 247 "address 0x%08lx on CPU %d.\n", addr,
217 /* 248 smp_processor_id());
218 * Prevent overwriting already in-flight context. 249 } else {
219 * This should not happen, let's hope disarming at least 250 /*
220 * prevents a panic. 251 * Prevent overwriting already in-flight context.
221 */ 252 * This should not happen, let's hope disarming at
222 pr_emerg("kmmio: recursive probe hit on CPU %d, " 253 * least prevents a panic.
254 */
255 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n", 256 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr); 257 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n", 258 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr); 259 ctx->addr);
260 disarm_kmmio_fault_page(faultpage);
261 }
227 goto no_kmmio_ctx; 262 goto no_kmmio_ctx;
228 } 263 }
229 ctx->active++; 264 ctx->active++;
@@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
244 regs->flags &= ~X86_EFLAGS_IF; 279 regs->flags &= ~X86_EFLAGS_IF;
245 280
246 /* Now we set present bit in PTE and single step. */ 281 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL); 282 disarm_kmmio_fault_page(ctx->fpage);
248 283
249 /* 284 /*
250 * If another cpu accesses the same page while we are stepping, 285 * If another cpu accesses the same page while we are stepping,
@@ -275,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); 310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276 311
277 if (!ctx->active) { 312 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n", 313 pr_warning("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id()); 314 smp_processor_id());
280 goto out; 315 goto out;
281 } 316 }
@@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
283 if (ctx->probe && ctx->probe->post_handler) 318 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs); 319 ctx->probe->post_handler(ctx->probe, condition, regs);
285 320
286 arm_kmmio_fault_page(ctx->fpage->page, NULL); 321 /* Prevent racing against release_kmmio_fault_page(). */
322 spin_lock(&kmmio_lock);
323 if (ctx->fpage->count)
324 arm_kmmio_fault_page(ctx->fpage);
325 spin_unlock(&kmmio_lock);
287 326
288 regs->flags &= ~X86_EFLAGS_TF; 327 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags; 328 regs->flags |= ctx->saved_flags;
@@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page)
315 f = get_kmmio_fault_page(page); 354 f = get_kmmio_fault_page(page);
316 if (f) { 355 if (f) {
317 if (!f->count) 356 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL); 357 arm_kmmio_fault_page(f);
319 f->count++; 358 f->count++;
320 return 0; 359 return 0;
321 } 360 }
322 361
323 f = kmalloc(sizeof(*f), GFP_ATOMIC); 362 f = kzalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f) 363 if (!f)
325 return -1; 364 return -1;
326 365
327 f->count = 1; 366 f->count = 1;
328 f->page = page; 367 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330 368
331 arm_kmmio_fault_page(f->page, NULL); 369 if (arm_kmmio_fault_page(f)) {
370 kfree(f);
371 return -1;
372 }
373
374 list_add_rcu(&f->list, kmmio_page_list(f->page));
332 375
333 return 0; 376 return 0;
334} 377}
@@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page,
347 f->count--; 390 f->count--;
348 BUG_ON(f->count < 0); 391 BUG_ON(f->count < 0);
349 if (!f->count) { 392 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL); 393 disarm_kmmio_fault_page(f);
351 f->release_next = *release_list; 394 f->release_next = *release_list;
352 *release_list = f; 395 *release_list = f;
353 } 396 }
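
The kmmio rework records the page's original presence bit in old_presence when a fault page is armed and restores exactly that state on disarm, rather than unconditionally setting the page present; an armed flag catches double-arming, post_kmmio_handler() re-arms only under kmmio_lock and only while the page is still registered (count > 0), and a secondary fault on the same address is left for do_page_fault() instead of being disarmed away. A minimal user-space sketch of the save/restore bookkeeping (illustrative only, a plain flag stands in for the PTE):

#include <stdbool.h>
#include <stdio.h>

#define PRESENT 0x1UL

struct fault_page {
        unsigned long flags;    /* stands in for the PTE value */
        bool old_presence;      /* presence before arming      */
        bool armed;
};

static void arm(struct fault_page *f)
{
        if (f->armed)
                fprintf(stderr, "double-arm detected\n");
        f->old_presence = f->flags & PRESENT;
        f->flags &= ~PRESENT;   /* accesses will now fault */
        f->armed = true;
}

static void disarm(struct fault_page *f)
{
        if (f->old_presence)
                f->flags |= PRESENT;    /* restore, never blindly force present */
        f->armed = false;
}

int main(void)
{
        struct fault_page f = { .flags = PRESENT };

        arm(&f);
        disarm(&f);
        printf("present after disarm: %lu\n", f.flags & PRESENT);
        return 0;
}

Restoring the saved state matters because a page may legitimately have been non-present before arming; forcing it present on disarm (as the old code did) would hide that condition from the normal fault path.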
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95a0352..3daefa04ace5 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
416 for_each_online_node(nid) 416 for_each_online_node(nid)
417 propagate_e820_map_node(nid); 417 propagate_e820_map_node(nid);
418 418
419 for_each_online_node(nid) 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
422 }
421 423
422 NODE_DATA(0)->bdata = &bootmem_node_data[0];
423 setup_bootmem_allocator(); 424 setup_bootmem_allocator();
424} 425}
425 426
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index ab50a8d7402c..427fd1b56df5 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi> 2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
3 */ 3 */
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/io.h> 5#include <linux/io.h>
@@ -9,35 +9,74 @@
9 9
10static unsigned long mmio_address; 10static unsigned long mmio_address;
11module_param(mmio_address, ulong, 0); 11module_param(mmio_address, ulong, 0);
12MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); 12MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
13 "(or 8 MB if read_far is non-zero).");
14
15static unsigned long read_far = 0x400100;
16module_param(read_far, ulong, 0);
17MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
18 "(default: 0x400100).");
19
20static unsigned v16(unsigned i)
21{
22 return i * 12 + 7;
23}
24
25static unsigned v32(unsigned i)
26{
27 return i * 212371 + 13;
28}
13 29
14static void do_write_test(void __iomem *p) 30static void do_write_test(void __iomem *p)
15{ 31{
16 unsigned int i; 32 unsigned int i;
33 pr_info(MODULE_NAME ": write test.\n");
17 mmiotrace_printk("Write test.\n"); 34 mmiotrace_printk("Write test.\n");
35
18 for (i = 0; i < 256; i++) 36 for (i = 0; i < 256; i++)
19 iowrite8(i, p + i); 37 iowrite8(i, p + i);
38
20 for (i = 1024; i < (5 * 1024); i += 2) 39 for (i = 1024; i < (5 * 1024); i += 2)
21 iowrite16(i * 12 + 7, p + i); 40 iowrite16(v16(i), p + i);
41
22 for (i = (5 * 1024); i < (16 * 1024); i += 4) 42 for (i = (5 * 1024); i < (16 * 1024); i += 4)
23 iowrite32(i * 212371 + 13, p + i); 43 iowrite32(v32(i), p + i);
24} 44}
25 45
26static void do_read_test(void __iomem *p) 46static void do_read_test(void __iomem *p)
27{ 47{
28 unsigned int i; 48 unsigned int i;
49 unsigned errs[3] = { 0 };
50 pr_info(MODULE_NAME ": read test.\n");
29 mmiotrace_printk("Read test.\n"); 51 mmiotrace_printk("Read test.\n");
52
30 for (i = 0; i < 256; i++) 53 for (i = 0; i < 256; i++)
31 ioread8(p + i); 54 if (ioread8(p + i) != i)
55 ++errs[0];
56
32 for (i = 1024; i < (5 * 1024); i += 2) 57 for (i = 1024; i < (5 * 1024); i += 2)
33 ioread16(p + i); 58 if (ioread16(p + i) != v16(i))
59 ++errs[1];
60
34 for (i = (5 * 1024); i < (16 * 1024); i += 4) 61 for (i = (5 * 1024); i < (16 * 1024); i += 4)
35 ioread32(p + i); 62 if (ioread32(p + i) != v32(i))
63 ++errs[2];
64
65 mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
66 errs[0], errs[1], errs[2]);
36} 67}
37 68
38static void do_test(void) 69static void do_read_far_test(void __iomem *p)
39{ 70{
40 void __iomem *p = ioremap_nocache(mmio_address, 0x4000); 71 pr_info(MODULE_NAME ": read far test.\n");
72 mmiotrace_printk("Read far test.\n");
73
74 ioread32(p + read_far);
75}
76
77static void do_test(unsigned long size)
78{
79 void __iomem *p = ioremap_nocache(mmio_address, size);
41 if (!p) { 80 if (!p) {
42 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 81 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
43 return; 82 return;
@@ -45,11 +84,15 @@ static void do_test(void)
45 mmiotrace_printk("ioremap returned %p.\n", p); 84 mmiotrace_printk("ioremap returned %p.\n", p);
46 do_write_test(p); 85 do_write_test(p);
47 do_read_test(p); 86 do_read_test(p);
87 if (read_far && read_far < size - 4)
88 do_read_far_test(p);
48 iounmap(p); 89 iounmap(p);
49} 90}
50 91
51static int __init init(void) 92static int __init init(void)
52{ 93{
94 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
95
53 if (mmio_address == 0) { 96 if (mmio_address == 0) {
54 pr_err(MODULE_NAME ": you have to use the module argument " 97 pr_err(MODULE_NAME ": you have to use the module argument "
55 "mmio_address.\n"); 98 "mmio_address.\n");
@@ -58,10 +101,11 @@ static int __init init(void)
58 return -ENXIO; 101 return -ENXIO;
59 } 102 }
60 103
61 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " 104 pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
62 "in PCI address space, and writing " 105 "address space, and writing 16 kB of rubbish in there.\n",
63 "rubbish in there.\n", mmio_address); 106 size >> 10, mmio_address);
64 do_test(); 107 do_test(size);
108 pr_info(MODULE_NAME ": All done.\n");
65 return 0; 109 return 0;
66} 110}
67 111
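
The test module now verifies what it reads back, counting mismatches per access width, and the new read_far parameter (default 0x400100) makes it map 8 MB instead of 16 kB and issue one extra 32-bit read at that offset. A hedged usage example: the MMIO address below is a placeholder for an unused region on the test machine, and the paths assume debugfs is mounted at /sys/kernel/debug:

        echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
        modprobe testmmiotrace mmio_address=0xf0000000 read_far=0x400100
        cat /sys/kernel/debug/tracing/trace_pipe > mmiotrace-test.txt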
diff --git a/crypto/api.c b/crypto/api.c
index efe77df6863f..38a2bc02a98c 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -215,8 +215,19 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask)
215 mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); 215 mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD);
216 type &= mask; 216 type &= mask;
217 217
218 alg = try_then_request_module(crypto_alg_lookup(name, type, mask), 218 alg = crypto_alg_lookup(name, type, mask);
219 name); 219 if (!alg) {
220 char tmp[CRYPTO_MAX_ALG_NAME];
221
222 request_module(name);
223
224 if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) &&
225 snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp))
226 request_module(tmp);
227
228 alg = crypto_alg_lookup(name, type, mask);
229 }
230
220 if (alg) 231 if (alg)
221 return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg; 232 return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg;
222 233
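
crypto_larval_lookup() now tries request_module(name) first and, when the lookup flags do not exclude fallback-needing implementations, also request_module("<name>-all"); hardware drivers that need a software fallback (the padlock hunks below switch their aliases to "aes-all"/"sha1-all") are therefore auto-loaded for ordinary users, but cannot be pulled in again while resolving their own fallback. An illustrative sketch of the two kinds of caller (flags and usage assumed from the patch context, not taken verbatim from any driver):

#include <crypto/hash.h>
#include <linux/err.h>

static int example_alloc(void)
{
        struct crypto_shash *user_tfm, *driver_fb;

        /* generic user of "sha1": the lookup may now also request the
         * "sha1-all" module, matching padlock-sha's new alias */
        user_tfm = crypto_alloc_shash("sha1", 0, 0);
        if (IS_ERR(user_tfm))
                return PTR_ERR(user_tfm);

        /* a driver allocating its software fallback masks out algorithms
         * that themselves need a fallback; the "-all" request is skipped,
         * so the driver cannot end up loading itself while registering */
        driver_fb = crypto_alloc_shash("sha1", 0, CRYPTO_ALG_NEED_FALLBACK);
        if (IS_ERR(driver_fb)) {
                crypto_free_shash(user_tfm);
                return PTR_ERR(driver_fb);
        }

        crypto_free_shash(driver_fb);
        crypto_free_shash(user_tfm);
        return 0;
}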
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 2d637e0fbc03..d9e751be8c5f 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -457,10 +457,12 @@ static int init_ixp_crypto(void)
457 if (!ctx_pool) { 457 if (!ctx_pool) {
458 goto err; 458 goto err;
459 } 459 }
460 ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0); 460 ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0,
461 "ixp_crypto:out", NULL);
461 if (ret) 462 if (ret)
462 goto err; 463 goto err;
463 ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0); 464 ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0,
465 "ixp_crypto:in", NULL);
464 if (ret) { 466 if (ret) {
465 qmgr_release_queue(SEND_QID); 467 qmgr_release_queue(SEND_QID);
466 goto err; 468 goto err;
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 856b3cc25583..3f0fdd18255d 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support");
489MODULE_LICENSE("GPL"); 489MODULE_LICENSE("GPL");
490MODULE_AUTHOR("Michal Ludvig"); 490MODULE_AUTHOR("Michal Ludvig");
491 491
492MODULE_ALIAS("aes"); 492MODULE_ALIAS("aes-all");
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index a7fbadebf623..a2c8e8514b63 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support.");
304MODULE_LICENSE("GPL"); 304MODULE_LICENSE("GPL");
305MODULE_AUTHOR("Michal Ludvig"); 305MODULE_AUTHOR("Michal Ludvig");
306 306
307MODULE_ALIAS("sha1"); 307MODULE_ALIAS("sha1-all");
308MODULE_ALIAS("sha256"); 308MODULE_ALIAS("sha256-all");
309MODULE_ALIAS("sha1-padlock"); 309MODULE_ALIAS("sha1-padlock");
310MODULE_ALIAS("sha256-padlock"); 310MODULE_ALIAS("sha256-padlock");
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index ea5440dd10dc..647374acba94 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma");
1401 1401
1402static struct platform_driver iop_adma_driver = { 1402static struct platform_driver iop_adma_driver = {
1403 .probe = iop_adma_probe, 1403 .probe = iop_adma_probe,
1404 .remove = iop_adma_remove, 1404 .remove = __devexit_p(iop_adma_remove),
1405 .driver = { 1405 .driver = {
1406 .owner = THIS_MODULE, 1406 .owner = THIS_MODULE,
1407 .name = "iop-adma", 1407 .name = "iop-adma",
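
This hunk, like the mv_xor, i2c-mv64xxx, orion_nand and pxafb hunks that follow, wraps the platform driver's .remove callback in __devexit_p(). When the driver is built in and hotplug support is configured out, __devexit functions are discarded, so the struct must not reference them directly; __devexit_p() evaluates to NULL in that configuration and to the function pointer otherwise. A sketch of the pattern (driver and function names are hypothetical):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static int __devinit foo_probe(struct platform_device *pdev)
{
        return 0;
}

static int __devexit foo_remove(struct platform_device *pdev)
{
        return 0;
}

static struct platform_driver foo_driver = {
        .probe  = foo_probe,
        .remove = __devexit_p(foo_remove),  /* NULL when __devexit is discarded */
        .driver = {
                .name  = "foo",
                .owner = THIS_MODULE,
        },
};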
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index d35cbd1ff0b3..5d5d5b31867f 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp,
1287 1287
1288static struct platform_driver mv_xor_driver = { 1288static struct platform_driver mv_xor_driver = {
1289 .probe = mv_xor_probe, 1289 .probe = mv_xor_probe,
1290 .remove = mv_xor_remove, 1290 .remove = __devexit_p(mv_xor_remove),
1291 .driver = { 1291 .driver = {
1292 .owner = THIS_MODULE, 1292 .owner = THIS_MODULE,
1293 .name = MV_XOR_NAME, 1293 .name = MV_XOR_NAME,
diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c
index 096e2a37446d..7c8b15b22bf2 100644
--- a/drivers/gpu/drm/drm_stub.c
+++ b/drivers/gpu/drm/drm_stub.c
@@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data,
168 file_priv->minor->master != file_priv->master) { 168 file_priv->minor->master != file_priv->master) {
169 mutex_lock(&dev->struct_mutex); 169 mutex_lock(&dev->struct_mutex);
170 file_priv->minor->master = drm_master_get(file_priv->master); 170 file_priv->minor->master = drm_master_get(file_priv->master);
171 mutex_lock(&dev->struct_mutex); 171 mutex_unlock(&dev->struct_mutex);
172 } 172 }
173 173
174 return 0; 174 return 0;
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c
index eeda276f8f16..7f186bbcb99d 100644
--- a/drivers/i2c/busses/i2c-mv64xxx.c
+++ b/drivers/i2c/busses/i2c-mv64xxx.c
@@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd,
482 return 0; 482 return 0;
483} 483}
484 484
485static void __devexit 485static void
486mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data) 486mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data)
487{ 487{
488 if (drv_data->reg_base) { 488 if (drv_data->reg_base) {
@@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev)
577 577
578static struct platform_driver mv64xxx_i2c_driver = { 578static struct platform_driver mv64xxx_i2c_driver = {
579 .probe = mv64xxx_i2c_probe, 579 .probe = mv64xxx_i2c_probe,
580 .remove = mv64xxx_i2c_remove, 580 .remove = __devexit_p(mv64xxx_i2c_remove),
581 .driver = { 581 .driver = {
582 .owner = THIS_MODULE, 582 .owner = THIS_MODULE,
583 .name = MV64XXX_I2C_CTLR_NAME, 583 .name = MV64XXX_I2C_CTLR_NAME,
diff --git a/drivers/mtd/nand/orion_nand.c b/drivers/mtd/nand/orion_nand.c
index 917cf8d3ae95..c2dfd3ea353d 100644
--- a/drivers/mtd/nand/orion_nand.c
+++ b/drivers/mtd/nand/orion_nand.c
@@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev)
149 149
150static struct platform_driver orion_nand_driver = { 150static struct platform_driver orion_nand_driver = {
151 .probe = orion_nand_probe, 151 .probe = orion_nand_probe,
152 .remove = orion_nand_remove, 152 .remove = __devexit_p(orion_nand_remove),
153 .driver = { 153 .driver = {
154 .name = "orion_nand", 154 .name = "orion_nand",
155 .owner = THIS_MODULE, 155 .owner = THIS_MODULE,
diff --git a/drivers/net/arm/Makefile b/drivers/net/arm/Makefile
index c69c0cdba4a2..811a3ccd14c1 100644
--- a/drivers/net/arm/Makefile
+++ b/drivers/net/arm/Makefile
@@ -4,7 +4,7 @@
4# 4#
5 5
6obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o 6obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o
7obj-$(CONFIG_ARM_ETHERH) += etherh.o ../8390.o 7obj-$(CONFIG_ARM_ETHERH) += etherh.o
8obj-$(CONFIG_ARM_ETHER3) += ether3.o 8obj-$(CONFIG_ARM_ETHER3) += ether3.o
9obj-$(CONFIG_ARM_ETHER1) += ether1.o 9obj-$(CONFIG_ARM_ETHER1) += ether1.o
10obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o 10obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o
diff --git a/drivers/net/arm/etherh.c b/drivers/net/arm/etherh.c
index 54b52e5b1821..f52f668c49bf 100644
--- a/drivers/net/arm/etherh.c
+++ b/drivers/net/arm/etherh.c
@@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = {
641 .ndo_open = etherh_open, 641 .ndo_open = etherh_open,
642 .ndo_stop = etherh_close, 642 .ndo_stop = etherh_close,
643 .ndo_set_config = etherh_set_config, 643 .ndo_set_config = etherh_set_config,
644 .ndo_start_xmit = ei_start_xmit, 644 .ndo_start_xmit = __ei_start_xmit,
645 .ndo_tx_timeout = ei_tx_timeout, 645 .ndo_tx_timeout = __ei_tx_timeout,
646 .ndo_get_stats = ei_get_stats, 646 .ndo_get_stats = __ei_get_stats,
647 .ndo_set_multicast_list = ei_set_multicast_list, 647 .ndo_set_multicast_list = __ei_set_multicast_list,
648 .ndo_validate_addr = eth_validate_addr, 648 .ndo_validate_addr = eth_validate_addr,
649 .ndo_set_mac_address = eth_mac_addr, 649 .ndo_set_mac_address = eth_mac_addr,
650 .ndo_change_mtu = eth_change_mtu, 650 .ndo_change_mtu = eth_change_mtu,
651#ifdef CONFIG_NET_POLL_CONTROLLER 651#ifdef CONFIG_NET_POLL_CONTROLLER
652 .ndo_poll_controller = ei_poll, 652 .ndo_poll_controller = __ei_poll,
653#endif 653#endif
654}; 654};
655 655
diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c
index 48ff701d3a72..2552b9f325ee 100644
--- a/drivers/video/pxafb.c
+++ b/drivers/video/pxafb.c
@@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev)
2230 2230
2231static struct platform_driver pxafb_driver = { 2231static struct platform_driver pxafb_driver = {
2232 .probe = pxafb_probe, 2232 .probe = pxafb_probe,
2233 .remove = pxafb_remove, 2233 .remove = __devexit_p(pxafb_remove),
2234 .suspend = pxafb_suspend, 2234 .suspend = pxafb_suspend,
2235 .resume = pxafb_resume, 2235 .resume = pxafb_resume,
2236 .driver = { 2236 .driver = {
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index f3f697df1d71..80044a4f3ab9 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void);
181#define rcu_enter_nohz() do { } while (0) 181#define rcu_enter_nohz() do { } while (0)
182#define rcu_exit_nohz() do { } while (0) 182#define rcu_exit_nohz() do { } while (0)
183 183
184/* A context switch is a grace period for rcuclassic. */
185static inline int rcu_blocking_is_gp(void)
186{
187 return num_online_cpus() == 1;
188}
189
184#endif /* __LINUX_RCUCLASSIC_H */ 190#endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 921340a7b71c..528343e6da51 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,6 +52,9 @@ struct rcu_head {
52 void (*func)(struct rcu_head *head); 52 void (*func)(struct rcu_head *head);
53}; 53};
54 54
55/* Internal to kernel, but needed by rcupreempt.h. */
56extern int rcu_scheduler_active;
57
55#if defined(CONFIG_CLASSIC_RCU) 58#if defined(CONFIG_CLASSIC_RCU)
56#include <linux/rcuclassic.h> 59#include <linux/rcuclassic.h>
57#elif defined(CONFIG_TREE_RCU) 60#elif defined(CONFIG_TREE_RCU)
@@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void);
265 268
266/* Internal to kernel */ 269/* Internal to kernel */
267extern void rcu_init(void); 270extern void rcu_init(void);
271extern void rcu_scheduler_starting(void);
268extern int rcu_needs_cpu(int cpu); 272extern int rcu_needs_cpu(int cpu);
269 273
270#endif /* __LINUX_RCUPDATE_H */ 274#endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 3e05c09b54a2..74304b4538d8 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void)
142#define rcu_exit_nohz() do { } while (0) 142#define rcu_exit_nohz() do { } while (0)
143#endif /* CONFIG_NO_HZ */ 143#endif /* CONFIG_NO_HZ */
144 144
145/*
146 * A context switch is a grace period for rcupreempt synchronize_rcu()
147 * only during early boot, before the scheduler has been initialized.
148 * So, how the heck do we get a context switch? Well, if the caller
149 * invokes synchronize_rcu(), they are willing to accept a context
150 * switch, so we simply pretend that one happened.
151 *
152 * After boot, there might be a blocked or preempted task in an RCU
153 * read-side critical section, so we cannot then take the fastpath.
154 */
155static inline int rcu_blocking_is_gp(void)
156{
157 return num_online_cpus() == 1 && !rcu_scheduler_active;
158}
159
145#endif /* __LINUX_RCUPREEMPT_H */ 160#endif /* __LINUX_RCUPREEMPT_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d4368b7975c3..a722fb67bb2d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void)
326} 326}
327#endif /* CONFIG_NO_HZ */ 327#endif /* CONFIG_NO_HZ */
328 328
329/* A context switch is a grace period for rcutree. */
330static inline int rcu_blocking_is_gp(void)
331{
332 return num_online_cpus() == 1;
333}
334
329#endif /* __LINUX_RCUTREE_H */ 335#endif /* __LINUX_RCUTREE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f0a50b20e8a0..a7c7698583bb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2303,9 +2303,13 @@ extern long sched_group_rt_runtime(struct task_group *tg);
2303extern int sched_group_set_rt_period(struct task_group *tg, 2303extern int sched_group_set_rt_period(struct task_group *tg,
2304 long rt_period_us); 2304 long rt_period_us);
2305extern long sched_group_rt_period(struct task_group *tg); 2305extern long sched_group_rt_period(struct task_group *tg);
2306extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
2306#endif 2307#endif
2307#endif 2308#endif
2308 2309
2310extern int task_can_switch_user(struct user_struct *up,
2311 struct task_struct *tsk);
2312
2309#ifdef CONFIG_TASK_XACCT 2313#ifdef CONFIG_TASK_XACCT
2310static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2314static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2311{ 2315{
diff --git a/init/main.c b/init/main.c
index 6441083f8273..6bf83afd654d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -98,7 +98,7 @@ static inline void mark_rodata_ro(void) { }
98extern void tc_init(void); 98extern void tc_init(void);
99#endif 99#endif
100 100
101enum system_states system_state; 101enum system_states system_state __read_mostly;
102EXPORT_SYMBOL(system_state); 102EXPORT_SYMBOL(system_state);
103 103
104/* 104/*
@@ -464,6 +464,7 @@ static noinline void __init_refok rest_init(void)
464 * at least once to get things moving: 464 * at least once to get things moving:
465 */ 465 */
466 init_idle_bootup_task(current); 466 init_idle_bootup_task(current);
467 rcu_scheduler_starting();
467 preempt_enable_no_resched(); 468 preempt_enable_no_resched();
468 schedule(); 469 schedule();
469 preempt_disable(); 470 preempt_disable();
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index bd5a9003497c..654c640a6b9c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu)
679void rcu_check_callbacks(int cpu, int user) 679void rcu_check_callbacks(int cpu, int user)
680{ 680{
681 if (user || 681 if (user ||
682 (idle_cpu(cpu) && !in_softirq() && 682 (idle_cpu(cpu) && rcu_scheduler_active &&
683 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 683 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
684 684
685 /* 685 /*
686 * Get here if this CPU took its interrupt from user 686 * Get here if this CPU took its interrupt from user
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a881aa..cae8a059cf47 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48enum rcu_barrier { 49enum rcu_barrier {
49 RCU_BARRIER_STD, 50 RCU_BARRIER_STD,
@@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
55static atomic_t rcu_barrier_cpu_count; 56static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 57static DEFINE_MUTEX(rcu_barrier_mutex);
57static struct completion rcu_barrier_completion; 58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly;
58 60
59/* 61/*
60 * Awaken the corresponding synchronize_rcu() instance now that a 62 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head)
80void synchronize_rcu(void) 82void synchronize_rcu(void)
81{ 83{
82 struct rcu_synchronize rcu; 84 struct rcu_synchronize rcu;
85
86 if (rcu_blocking_is_gp())
87 return;
88
83 init_completion(&rcu.completion); 89 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */ 90 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu); 91 call_rcu(&rcu.head, wakeme_after_rcu);
@@ -175,3 +181,9 @@ void __init rcu_init(void)
175 __rcu_init(); 181 __rcu_init();
176} 182}
177 183
184void rcu_scheduler_starting(void)
185{
186 WARN_ON(num_online_cpus() != 1);
187 WARN_ON(nr_context_switches() > 0);
188 rcu_scheduler_active = 1;
189}
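
synchronize_rcu() (and rcupreempt's __synchronize_sched()) now returns immediately when rcu_blocking_is_gp() reports that blocking already implies a grace period: for classic and tree RCU that is "only one CPU online", and for preemptible RCU additionally "the scheduler has not started yet", tracked by rcu_scheduler_active, which rcu_scheduler_starting() sets from rest_init() before the first context switch and which rcu_check_callbacks() also consults. This lets early-boot code wait for a grace period without hanging on a callback that could never run. A hypothetical early-boot caller (struct foo and the function are made up for illustration):

#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int val;
};

static struct foo *current_cfg;

static void __init replace_early_cfg(struct foo *new_cfg)
{
        struct foo *old = current_cfg;

        rcu_assign_pointer(current_cfg, new_cfg);
        synchronize_rcu();      /* fast path: returns at once during early boot */
        kfree(old);
}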
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 33cfc50781f9..5d59e850fb71 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1181,6 +1181,9 @@ void __synchronize_sched(void)
1181{ 1181{
1182 struct rcu_synchronize rcu; 1182 struct rcu_synchronize rcu;
1183 1183
1184 if (num_online_cpus() == 1)
1185 return; /* blocking is gp if only one CPU! */
1186
1184 init_completion(&rcu.completion); 1187 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */ 1188 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1189 call_rcu_sched(&rcu.head, wakeme_after_rcu);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b2fd602a6f6f..97ce31579ec0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
948void rcu_check_callbacks(int cpu, int user) 948void rcu_check_callbacks(int cpu, int user)
949{ 949{
950 if (user || 950 if (user ||
951 (idle_cpu(cpu) && !in_softirq() && 951 (idle_cpu(cpu) && rcu_scheduler_active &&
952 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 952 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
953 953
954 /* 954 /*
955 * Get here if this CPU took its interrupt from user 955 * Get here if this CPU took its interrupt from user
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e5c38e1c8b5..0a76d0b6f215 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{ 223{
224 ktime_t now; 224 ktime_t now;
225 225
226 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) 226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return; 227 return;
228 228
229 if (hrtimer_active(&rt_b->rt_period_timer)) 229 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -9219,6 +9219,16 @@ static int sched_rt_global_constraints(void)
9219 9219
9220 return ret; 9220 return ret;
9221} 9221}
9222
9223int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9224{
9225 /* Don't accept realtime tasks when there is no way for them to run */
9226 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9227 return 0;
9228
9229 return 1;
9230}
9231
9222#else /* !CONFIG_RT_GROUP_SCHED */ 9232#else /* !CONFIG_RT_GROUP_SCHED */
9223static int sched_rt_global_constraints(void) 9233static int sched_rt_global_constraints(void)
9224{ 9234{
@@ -9312,8 +9322,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9312 struct task_struct *tsk) 9322 struct task_struct *tsk)
9313{ 9323{
9314#ifdef CONFIG_RT_GROUP_SCHED 9324#ifdef CONFIG_RT_GROUP_SCHED
9315 /* Don't accept realtime tasks when there is no way for them to run */ 9325 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9316 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9317 return -EINVAL; 9326 return -EINVAL;
9318#else 9327#else
9319 /* We don't support RT-tasks being in separate groups */ 9328 /* We don't support RT-tasks being in separate groups */
diff --git a/kernel/sys.c b/kernel/sys.c
index f145c415bc16..37f458e6882a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -559,7 +559,7 @@ error:
559 abort_creds(new); 559 abort_creds(new);
560 return retval; 560 return retval;
561} 561}
562 562
563/* 563/*
564 * change the user struct in a credentials set to match the new UID 564 * change the user struct in a credentials set to match the new UID
565 */ 565 */
@@ -571,6 +571,11 @@ static int set_user(struct cred *new)
571 if (!new_user) 571 if (!new_user)
572 return -EAGAIN; 572 return -EAGAIN;
573 573
574 if (!task_can_switch_user(new_user, current)) {
575 free_uid(new_user);
576 return -EINVAL;
577 }
578
574 if (atomic_read(&new_user->processes) >= 579 if (atomic_read(&new_user->processes) >=
575 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 580 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
576 new_user != INIT_USER) { 581 new_user != INIT_USER) {
@@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
631 goto error; 636 goto error;
632 } 637 }
633 638
634 retval = -EAGAIN; 639 if (new->uid != old->uid) {
635 if (new->uid != old->uid && set_user(new) < 0) 640 retval = set_user(new);
636 goto error; 641 if (retval < 0)
637 642 goto error;
643 }
638 if (ruid != (uid_t) -1 || 644 if (ruid != (uid_t) -1 ||
639 (euid != (uid_t) -1 && euid != old->uid)) 645 (euid != (uid_t) -1 && euid != old->uid))
640 new->suid = new->euid; 646 new->suid = new->euid;
@@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
680 retval = -EPERM; 686 retval = -EPERM;
681 if (capable(CAP_SETUID)) { 687 if (capable(CAP_SETUID)) {
682 new->suid = new->uid = uid; 688 new->suid = new->uid = uid;
683 if (uid != old->uid && set_user(new) < 0) { 689 if (uid != old->uid) {
684 retval = -EAGAIN; 690 retval = set_user(new);
685 goto error; 691 if (retval < 0)
692 goto error;
686 } 693 }
687 } else if (uid != old->uid && uid != new->suid) { 694 } else if (uid != old->uid && uid != new->suid) {
688 goto error; 695 goto error;
@@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
734 goto error; 741 goto error;
735 } 742 }
736 743
737 retval = -EAGAIN;
738 if (ruid != (uid_t) -1) { 744 if (ruid != (uid_t) -1) {
739 new->uid = ruid; 745 new->uid = ruid;
740 if (ruid != old->uid && set_user(new) < 0) 746 if (ruid != old->uid) {
741 goto error; 747 retval = set_user(new);
748 if (retval < 0)
749 goto error;
750 }
742 } 751 }
743 if (euid != (uid_t) -1) 752 if (euid != (uid_t) -1)
744 new->euid = euid; 753 new->euid = euid;
diff --git a/kernel/user.c b/kernel/user.c
index 3551ac742395..6a9b696128c8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags)
362 362
363#endif 363#endif
364 364
365#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
366/*
367 * We need to check if a setuid can take place. This function should be called
368 * before successfully completing the setuid.
369 */
370int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
371{
372
373 return sched_rt_can_attach(up->tg, tsk);
374
375}
376#else
377int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
378{
379 return 1;
380}
381#endif
382
365/* 383/*
366 * Locate the user_struct for the passed UID. If found, take a ref on it. The 384 * Locate the user_struct for the passed UID. If found, take a ref on it. The
367 * caller must undo that ref with free_uid(). 385 * caller must undo that ref with free_uid().
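
Taken together, the sched.c, sys.c and user.c hunks above make setuid() and friends refuse a uid change that would strand the caller: set_user() now calls task_can_switch_user(), which with CONFIG_RT_GROUP_SCHED and CONFIG_USER_SCHED delegates to the new sched_rt_can_attach() and rejects a realtime task moving into a user group whose rt_runtime is zero, and the syscalls propagate set_user()'s return value (-EINVAL here) instead of folding every failure into -EAGAIN. A condensed sketch of the resulting check (simplified from the hunks above, allocation and the remaining bookkeeping elided; not the literal kernel code):

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/sched.h>

int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
{
#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
        /* 0 when tsk is a realtime task and up's group has zero RT runtime */
        return sched_rt_can_attach(up->tg, tsk);
#else
        return 1;
#endif
}

static int set_user_sketch(struct cred *new, struct user_struct *new_user)
{
        if (!task_can_switch_user(new_user, current)) {
                free_uid(new_user);
                return -EINVAL; /* now propagated to setuid()/setreuid() callers */
        }
        /* RLIMIT_NPROC check and the actual switch to new_user follow here */
        new->user = new_user;
        return 0;
}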