Diffstat:
-rw-r--r--  Documentation/kernel-parameters.txt | 8
-rw-r--r--  Documentation/x86_64/boot-options.txt | 132
-rw-r--r--  Documentation/x86_64/cpu-hotplug-spec | 2
-rw-r--r--  Documentation/x86_64/kernel-stacks | 26
-rw-r--r--  Documentation/x86_64/machinecheck | 70
-rw-r--r--  Documentation/x86_64/mm.txt | 22
-rw-r--r--  MAINTAINERS | 1
-rw-r--r--  arch/i386/Kconfig | 18
-rw-r--r--  arch/i386/Kconfig.cpu | 5
-rw-r--r--  arch/i386/Kconfig.debug | 2
-rw-r--r--  arch/i386/defconfig | 51
-rw-r--r--  arch/i386/kernel/Makefile | 3
-rw-r--r--  arch/i386/kernel/apic.c | 6
-rw-r--r--  arch/i386/kernel/apm.c | 26
-rw-r--r--  arch/i386/kernel/asm-offsets.c | 2
-rw-r--r--  arch/i386/kernel/cpu/common.c | 14
-rw-r--r--  arch/i386/kernel/cpu/cyrix.c | 52
-rw-r--r--  arch/i386/kernel/cpu/mcheck/mce.c | 1
-rw-r--r--  arch/i386/kernel/cpu/mcheck/mce.h | 2
-rw-r--r--  arch/i386/kernel/cpu/mcheck/p4.c | 2
-rw-r--r--  arch/i386/kernel/cpu/mtrr/if.c | 30
-rw-r--r--  arch/i386/kernel/cpu/mtrr/main.c | 6
-rw-r--r--  arch/i386/kernel/cpu/mtrr/mtrr.h | 2
-rw-r--r--  arch/i386/kernel/cpu/proc.c | 14
-rw-r--r--  arch/i386/kernel/cpu/transmeta.c | 5
-rw-r--r--  arch/i386/kernel/cpuid.c | 7
-rw-r--r--  arch/i386/kernel/e820.c | 18
-rw-r--r--  arch/i386/kernel/entry.S | 78
-rw-r--r--  arch/i386/kernel/head.S | 38
-rw-r--r--  arch/i386/kernel/io_apic.c | 4
-rw-r--r--  arch/i386/kernel/irq.c | 3
-rw-r--r--  arch/i386/kernel/kprobes.c | 6
-rw-r--r--  arch/i386/kernel/microcode.c | 2
-rw-r--r--  arch/i386/kernel/msr.c | 13
-rw-r--r--  arch/i386/kernel/nmi.c | 98
-rw-r--r--  arch/i386/kernel/paravirt.c | 116
-rw-r--r--  arch/i386/kernel/pcspeaker.c | 20
-rw-r--r--  arch/i386/kernel/process.c | 99
-rw-r--r--  arch/i386/kernel/ptrace.c | 16
-rw-r--r--  arch/i386/kernel/setup.c | 35
-rw-r--r--  arch/i386/kernel/signal.c | 16
-rw-r--r--  arch/i386/kernel/smp.c | 7
-rw-r--r--  arch/i386/kernel/smpboot.c | 16
-rw-r--r--  arch/i386/kernel/sysenter.c | 2
-rw-r--r--  arch/i386/kernel/time.c | 14
-rw-r--r--  arch/i386/kernel/traps.c | 27
-rw-r--r--  arch/i386/kernel/tsc.c | 26
-rw-r--r--  arch/i386/kernel/vm86.c | 33
-rw-r--r--  arch/i386/kernel/vmi.c | 949
-rw-r--r--  arch/i386/kernel/vmitime.c | 499
-rw-r--r--  arch/i386/kernel/vmlinux.lds.S | 7
-rw-r--r--  arch/i386/math-emu/get_address.c | 14
-rw-r--r--  arch/i386/math-emu/status_w.h | 8
-rw-r--r--  arch/i386/mm/discontig.c | 1
-rw-r--r--  arch/i386/mm/fault.c | 18
-rw-r--r--  arch/i386/mm/init.c | 4
-rw-r--r--  arch/i386/mm/pageattr.c | 2
-rw-r--r--  arch/i386/mm/pgtable.c | 26
-rw-r--r--  arch/i386/oprofile/op_model_ppro.c | 9
-rw-r--r--  arch/i386/pci/Makefile | 2
-rw-r--r--  arch/i386/pci/mmconfig-shared.c | 264
-rw-r--r--  arch/i386/pci/mmconfig.c | 96
-rw-r--r--  arch/i386/pci/pci.h | 10
-rw-r--r--  arch/x86_64/Kconfig | 22
-rw-r--r--  arch/x86_64/defconfig | 45
-rw-r--r--  arch/x86_64/ia32/ia32_signal.c | 11
-rw-r--r--  arch/x86_64/ia32/ia32entry.S | 1
-rw-r--r--  arch/x86_64/kernel/Makefile | 2
-rw-r--r--  arch/x86_64/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86_64/kernel/e820.c | 38
-rw-r--r--  arch/x86_64/kernel/head.S | 20
-rw-r--r--  arch/x86_64/kernel/io_apic.c | 24
-rw-r--r--  arch/x86_64/kernel/ioport.c | 2
-rw-r--r--  arch/x86_64/kernel/irq.c | 12
-rw-r--r--  arch/x86_64/kernel/mce.c | 66
-rw-r--r--  arch/x86_64/kernel/mce_amd.c | 44
-rw-r--r--  arch/x86_64/kernel/nmi.c | 75
-rw-r--r--  arch/x86_64/kernel/pci-calgary.c | 17
-rw-r--r--  arch/x86_64/kernel/pci-dma.c | 28
-rw-r--r--  arch/x86_64/kernel/pci-gart.c | 4
-rw-r--r--  arch/x86_64/kernel/ptrace.c | 8
-rw-r--r--  arch/x86_64/kernel/setup.c | 169
-rw-r--r--  arch/x86_64/kernel/setup64.c | 1
-rw-r--r--  arch/x86_64/kernel/stacktrace.c | 5
-rw-r--r--  arch/x86_64/kernel/time.c | 14
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c | 5
-rw-r--r--  arch/x86_64/lib/Makefile | 2
-rw-r--r--  arch/x86_64/lib/copy_user_nocache.S | 217
-rw-r--r--  arch/x86_64/mm/fault.c | 18
-rw-r--r--  arch/x86_64/mm/numa.c | 202
-rw-r--r--  arch/x86_64/mm/pageattr.c | 4
-rw-r--r--  arch/x86_64/pci/Makefile | 3
-rw-r--r--  arch/x86_64/pci/mmconfig.c | 116
-rw-r--r--  drivers/acpi/namespace/nsinit.c | 9
-rw-r--r--  drivers/kvm/vmx.c | 12
-rw-r--r--  fs/binfmt_elf.c | 3
-rw-r--r--  include/asm-generic/pgtable.h | 13
-rw-r--r--  include/asm-i386/apic.h | 2
-rw-r--r--  include/asm-i386/bugs.h | 2
-rw-r--r--  include/asm-i386/desc.h | 2
-rw-r--r--  include/asm-i386/elf.h | 4
-rw-r--r--  include/asm-i386/idle.h | 14
-rw-r--r--  include/asm-i386/mce.h | 2
-rw-r--r--  include/asm-i386/mmu_context.h | 2
-rw-r--r--  include/asm-i386/paravirt.h | 168
-rw-r--r--  include/asm-i386/pda.h | 12
-rw-r--r--  include/asm-i386/pgalloc.h | 30
-rw-r--r--  include/asm-i386/processor.h | 14
-rw-r--r--  include/asm-i386/ptrace.h | 8
-rw-r--r--  include/asm-i386/segment.h | 19
-rw-r--r--  include/asm-i386/setup.h | 2
-rw-r--r--  include/asm-i386/smp.h | 5
-rw-r--r--  include/asm-i386/time.h | 1
-rw-r--r--  include/asm-i386/timer.h | 3
-rw-r--r--  include/asm-i386/vmi.h | 262
-rw-r--r--  include/asm-i386/vmi_time.h | 103
-rw-r--r--  include/asm-x86_64/bitops.h | 2
-rw-r--r--  include/asm-x86_64/dma-mapping.h | 3
-rw-r--r--  include/asm-x86_64/e820.h | 2
-rw-r--r--  include/asm-x86_64/hw_irq.h | 2
-rw-r--r--  include/asm-x86_64/io.h | 2
-rw-r--r--  include/asm-x86_64/io_apic.h | 14
-rw-r--r--  include/asm-x86_64/mce.h | 2
-rw-r--r--  include/asm-x86_64/mmzone.h | 18
-rw-r--r--  include/asm-x86_64/mutex.h | 6
-rw-r--r--  include/asm-x86_64/pgalloc.h | 5
-rw-r--r--  include/asm-x86_64/pgtable.h | 9
-rw-r--r--  include/asm-x86_64/uaccess.h | 14
-rw-r--r--  include/asm-x86_64/vsyscall.h | 5
-rw-r--r--  include/linux/binfmts.h | 1
-rw-r--r--  include/linux/nmi.h | 9
-rw-r--r--  include/linux/time.h | 2
-rw-r--r--  init/main.c | 81
-rw-r--r--  kernel/kmod.c | 44
-rw-r--r--  kernel/sched.c | 7
-rw-r--r--  kernel/timer.c | 4
-rw-r--r--  scripts/mod/modpost.c | 10
137 files changed, 4113 insertions(+), 1103 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d25acd51e181..22b19962a1a2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -104,6 +104,9 @@ loader, and have no meaning to the kernel directly.
 Do not modify the syntax of boot loader parameters without extreme
 need or coordination with <Documentation/i386/boot.txt>.
 
+There are also arch-specific kernel-parameters not documented here.
+See for example <Documentation/x86_64/boot-options.txt>.
+
 Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
 a trailing = on the name of any parameter states that that parameter will
 be entered as an environment variable, whereas its absence indicates that
@@ -361,6 +364,11 @@ and is between 256 and 4096 characters. It is defined in the file
 		clocksource is not available, it defaults to PIT.
 		Format: { pit | tsc | cyclone | pmtmr }
 
+	code_bytes	[IA32] How many bytes of object code to print in an
+			oops report.
+			Range: 0 - 8192
+			Default: 64
+
 	disable_8254_timer
 	enable_8254_timer
 		[IA32/X86_64] Disable/Enable interrupt 0 timer routing
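[Editorial usage sketch, not part of the patch: the new option is passed on
the kernel command line like any other boot parameter, so a boot line such as

    root=/dev/sda1 ro console=tty0 code_bytes=128

would make an oops report print 128 bytes of object code around the faulting
instruction instead of the default 64. The root/console values here are
invented for illustration.]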
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 5c86ed6f0448..625a21db0c2a 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -180,40 +180,81 @@ PCI
 pci=lastbus=NUMBER	Scan upto NUMBER busses, no matter what the mptable says.
 pci=noacpi		Don't use ACPI to set up PCI interrupt routing.
 
-IOMMU
+IOMMU (input/output memory management unit)
 
- iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
-	[,forcesac][,fullflush][,nomerge][,noaperture][,calgary]
-   size		set size of iommu (in bytes)
-   noagp	don't initialize the AGP driver and use full aperture.
-   off		don't use the IOMMU
-   leak		turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
-   memaper[=order] allocate an own aperture over RAM with size 32MB^order.
-   noforce	don't force IOMMU usage. Default.
-   force	Force IOMMU.
-   merge	Do SG merging. Implies force (experimental)
-   nomerge	Don't do SG merging.
-   forcesac	For SAC mode for masks <40bits (experimental)
-   fullflush	Flush IOMMU on each allocation (default)
-   nofullflush	Don't use IOMMU fullflush
-   allowed	overwrite iommu off workarounds for specific chipsets.
-   soft		Use software bounce buffering (default for Intel machines)
-   noaperture	Don't touch the aperture for AGP.
-   allowdac	Allow DMA >4GB
-		When off all DMA over >4GB is forced through an IOMMU or bounce
-		buffering.
-   nodac	Forbid DMA >4GB
-   panic	Always panic when IOMMU overflows
-   calgary	Use the Calgary IOMMU if it is available
+ Currently four x86-64 PCI-DMA mapping implementations exist:
+
+ 1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all
+    (e.g. because you have < 3 GB memory).
+    Kernel boot message: "PCI-DMA: Disabling IOMMU"
+
+ 2. <arch/x86_64/kernel/pci-gart.c>: AMD GART based hardware IOMMU.
+    Kernel boot message: "PCI-DMA: using GART IOMMU"
+
+ 3. <arch/x86_64/kernel/pci-swiotlb.c>: Software IOMMU implementation. Used
+    e.g. if there is no hardware IOMMU in the system and it is needed because
+    you have >3GB memory or told the kernel to use it (iommu=soft).
+    Kernel boot message: "PCI-DMA: Using software bounce buffering
+    for IO (SWIOTLB)"
+
+ 4. <arch/x86_64/pci-calgary.c>: IBM Calgary hardware IOMMU. Used in IBM
+    pSeries and xSeries servers. This hardware IOMMU supports DMA address
+    mapping with memory protection, etc.
+    Kernel boot message: "PCI-DMA: Using Calgary IOMMU"
+
+ iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>]
+	[,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge]
+	[,noaperture][,calgary]
 
- swiotlb=pages[,force]
-
- pages		Prereserve that many 128K pages for the software IO bounce buffering.
- force		Force all IO through the software TLB.
-
- calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
- calgary=[translate_empty_slots]
- calgary=[disable=<PCI bus number>]
+ General iommu options:
+   off		Don't initialize and use any kind of IOMMU.
+   noforce	Don't force hardware IOMMU usage when it is not needed
+		(default).
+   force	Force the use of the hardware IOMMU even when it is
+		not actually needed (e.g. because < 3 GB memory).
+   soft		Use software bounce buffering (SWIOTLB) (default for
+		Intel machines). This can be used to prevent the usage
+		of an available hardware IOMMU.
+
+ iommu options only relevant to the AMD GART hardware IOMMU:
+   <size>	Set the size of the remapping area in bytes.
+   allowed	Overwrite iommu off workarounds for specific chipsets.
+   fullflush	Flush IOMMU on each allocation (default).
+   nofullflush	Don't use IOMMU fullflush.
+   leak		Turn on simple iommu leak tracing (only when
+		CONFIG_IOMMU_LEAK is on). Default number of leak pages
+		is 20.
+   memaper[=<order>] Allocate an own aperture over RAM with size 32MB<<order
+		(default: order=1, i.e. 64MB).
+   merge	Do scatter-gather (SG) merging. Implies "force"
+		(experimental).
+   nomerge	Don't do scatter-gather (SG) merging.
+   noaperture	Ask the IOMMU not to touch the aperture for AGP.
+   forcesac	Force single-address cycle (SAC) mode for masks <40bits
+		(experimental).
+   noagp	Don't initialize the AGP driver and use full aperture.
+   allowdac	Allow double-address cycle (DAC) mode, i.e. DMA >4GB.
+		DAC is used with 32-bit PCI to push a 64-bit address in
+		two cycles. When off all DMA over >4GB is forced through
+		an IOMMU or software bounce buffering.
+   nodac	Forbid DAC mode, i.e. DMA >4GB.
+   panic	Always panic when IOMMU overflows.
+   calgary	Use the Calgary IOMMU if it is available.
 
+ iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
+ implementation:
+   swiotlb=<pages>[,force]
+   <pages>	Prereserve that many 128K pages for the software IO
+		bounce buffering.
+   force	Force all IO through the software TLB.
+
+ Settings for the IBM Calgary hardware IOMMU currently found in IBM
+ pSeries and xSeries machines:
+
+   calgary=[64k,128k,256k,512k,1M,2M,4M,8M]
+   calgary=[translate_empty_slots]
+   calgary=[disable=<PCI bus number>]
+   panic	Always panic when IOMMU overflows.
 
  64k,...,8M - Set the size of each PCI slot's translation table
     when using the Calgary IOMMU. This is the size of the translation
@@ -234,14 +275,14 @@ IOMMU
 
 Debugging
 
   oops=panic Always panic on oopses. Default is to just kill the process,
 	     but there is a small probability of deadlocking the machine.
 	     This will also cause panics on machine check exceptions.
 	     Useful together with panic=30 to trigger a reboot.
 
-  kstack=N   Print that many words from the kernel stack in oops dumps.
+  kstack=N   Print N words from the kernel stack in oops dumps.
 
   pagefaulttrace Dump all page faults. Only useful for extreme debugging
 	     and will create a lot of output.
 
   call_trace=[old|both|newfallback|new]
@@ -251,15 +292,8 @@ Debugging
 	newfallback: use new unwinder but fall back to old if it gets
 	stuck (default)
 
-  call_trace=[old|both|newfallback|new]
-	old: use old inexact backtracer
-	new: use new exact dwarf2 unwinder
-	both: print entries from both
-	newfallback: use new unwinder but fall back to old if it gets
-	stuck (default)
-
-Misc
+Miscellaneous
 
   noreplacement Don't replace instructions with more appropriate ones
 	     for the CPU. This may be useful on asymmetric MP systems
-	     where some CPU have less capabilities than the others.
+	     where some CPUs have less capabilities than others.
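[Editorial usage sketch of the options documented above; the concrete values
are invented for illustration:

    iommu=force,memaper=2     # force the GART IOMMU, 32MB<<2 = 128MB aperture
    iommu=soft swiotlb=2048   # software bounce buffering, 2048 x 128K = 256MB
    calgary=256k,disable=3    # 256k Calgary slot tables, bus 3 not translated
]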
diff --git a/Documentation/x86_64/cpu-hotplug-spec b/Documentation/x86_64/cpu-hotplug-spec
index 5c0fa345e556..3c23e0587db3 100644
--- a/Documentation/x86_64/cpu-hotplug-spec
+++ b/Documentation/x86_64/cpu-hotplug-spec
@@ -2,7 +2,7 @@ Firmware support for CPU hotplug under Linux/x86-64
 ---------------------------------------------------
 
 Linux/x86-64 supports CPU hotplug now. For various reasons Linux wants to
-know in advance boot time the maximum number of CPUs that could be plugged
+know in advance of boot time the maximum number of CPUs that could be plugged
 into the system. ACPI 3.0 currently has no official way to supply
 this information from the firmware to the operating system.
 
diff --git a/Documentation/x86_64/kernel-stacks b/Documentation/x86_64/kernel-stacks
index bddfddd466ab..5ad65d51fb95 100644
--- a/Documentation/x86_64/kernel-stacks
+++ b/Documentation/x86_64/kernel-stacks
@@ -9,9 +9,9 @@ zombie. While the thread is in user space the kernel stack is empty
 except for the thread_info structure at the bottom.
 
 In addition to the per thread stacks, there are specialized stacks
-associated with each cpu. These stacks are only used while the kernel
-is in control on that cpu, when a cpu returns to user space the
-specialized stacks contain no useful data. The main cpu stacks is
+associated with each CPU. These stacks are only used while the kernel
+is in control on that CPU; when a CPU returns to user space the
+specialized stacks contain no useful data. The main CPU stacks are:
 
 * Interrupt stack. IRQSTACKSIZE
 
@@ -32,17 +32,17 @@ x86_64 also has a feature which is not available on i386, the ability
 to automatically switch to a new stack for designated events such as
 double fault or NMI, which makes it easier to handle these unusual
 events on x86_64. This feature is called the Interrupt Stack Table
-(IST). There can be up to 7 IST entries per cpu. The IST code is an
-index into the Task State Segment (TSS), the IST entries in the TSS
-point to dedicated stacks, each stack can be a different size.
+(IST). There can be up to 7 IST entries per CPU. The IST code is an
+index into the Task State Segment (TSS). The IST entries in the TSS
+point to dedicated stacks; each stack can be a different size.
 
-An IST is selected by an non-zero value in the IST field of an
+An IST is selected by a non-zero value in the IST field of an
 interrupt-gate descriptor. When an interrupt occurs and the hardware
 loads such a descriptor, the hardware automatically sets the new stack
 pointer based on the IST value, then invokes the interrupt handler. If
 software wants to allow nested IST interrupts then the handler must
 adjust the IST values on entry to and exit from the interrupt handler.
-(this is occasionally done, e.g. for debug exceptions)
+(This is occasionally done, e.g. for debug exceptions.)
 
 Events with different IST codes (i.e. with different stacks) can be
 nested. For example, a debug interrupt can safely be interrupted by an
@@ -58,17 +58,17 @@ The currently assigned IST stacks are :-
 
   Used for interrupt 12 - Stack Fault Exception (#SS).
 
-  This allows to recover from invalid stack segments. Rarely
+  This allows the CPU to recover from invalid stack segments. Rarely
   happens.
 
 * DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE).
 
   Used for interrupt 8 - Double Fault Exception (#DF).
 
-  Invoked when handling a exception causes another exception. Happens
-  when the kernel is very confused (e.g. kernel stack pointer corrupt)
-  Using a separate stack allows to recover from it well enough in many
-  cases to still output an oops.
+  Invoked when handling one exception causes another exception. Happens
+  when the kernel is very confused (e.g. kernel stack pointer corrupt).
+  Using a separate stack allows the kernel to recover from it well enough
+  in many cases to still output an oops.
 
 * NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE).
 
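[Editorial illustration of the TSS/IST mechanism the text above describes; a
simplified C sketch, not the kernel's actual struct definition, with invented
names:

    /* Simplified 64-bit TSS: an interrupt gate whose IST field holds n
     * (1..7) makes the CPU load ist[n-1] as the new stack pointer before
     * invoking the handler. */
    struct tss_sketch {
            unsigned int       reserved0;
            unsigned long long rsp[3];    /* stacks for CPL 0-2 transitions */
            unsigned long long reserved1;
            unsigned long long ist[7];    /* Interrupt Stack Table entries */
            /* ... I/O bitmap fields follow in the real TSS ... */
    } __attribute__((packed));
]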
diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck
new file mode 100644
index 000000000000..068a6d9904b9
--- /dev/null
+++ b/Documentation/x86_64/machinecheck
@@ -0,0 +1,70 @@
+
+Configurable sysfs parameters for the x86-64 machine check code.
+
+Machine checks report internal hardware error conditions detected
+by the CPU. Uncorrected errors typically cause a machine check
+(often with panic), corrected ones cause a machine check log entry.
+
+Machine checks are organized in banks (normally associated with
+a hardware subsystem) and subevents in a bank. The exact meaning
+of the banks and subevents is CPU specific.
+
+mcelog knows how to decode them.
+
+When you see the "Machine check errors logged" message in the system
+log then mcelog should be run to collect and decode machine check entries
+from /dev/mcelog. Normally mcelog should be run regularly from a cronjob.
+
+Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN
+(N = CPU number).
+
+The directory contains some configurable entries:
+
+Entries:
+
+bankNctl
+(N bank number)
+	64bit Hex bitmask enabling/disabling specific subevents for bank N.
+	When a bit in the bitmask is zero then the respective
+	subevent will not be reported.
+	By default all events are enabled.
+	Note that the BIOS maintains another mask to disable specific events
+	per bank. This is not visible here.
+
+The following entries appear for each CPU, but they are truly shared
+between all CPUs.
+
+check_interval
+	How often to poll for corrected machine check errors, in seconds
+	(note: output is hexadecimal). Default 5 minutes.
+
+tolerant
+	Tolerance level. When a machine check exception occurs for a
+	non-corrected machine check the kernel can take different actions.
+	Since machine check exceptions can happen any time it is sometimes
+	risky for the kernel to kill a process because it defies
+	normal kernel locking rules. The tolerance level configures
+	how hard the kernel tries to recover even at some risk of deadlock.
+
+	0: always panic,
+	1: panic if deadlock possible,
+	2: try to avoid panic,
+	3: never panic or exit (for testing only)
+
+	Default: 1
+
+	Note this only makes a difference if the CPU allows recovery
+	from a machine check exception. Current x86 CPUs generally do not.
+
+trigger
+	Program to run when a machine check event is detected.
+	This is an alternative to running mcelog regularly from cron
+	and allows events to be detected faster.
+
+TBD: document entries for AMD threshold interrupt configuration.
+
+For more details about the x86 machine check architecture
+see the Intel and AMD architecture manuals from their developer websites.
+
+For more details about the architecture
+see http://one.firstfloor.org/~andi/mce.pdf
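[Editorial usage sketch for the sysfs entries described above; CPU/bank
numbers and values are illustrative:

    # inspect the subevent mask of bank 4 on CPU 0
    cat /sys/devices/system/machinecheck/machinecheck0/bank4ctl

    # poll for corrected errors every 60 seconds instead of every 5 minutes
    echo 60 > /sys/devices/system/machinecheck/machinecheck0/check_interval

    # panic only if a deadlock is possible
    echo 1 > /sys/devices/system/machinecheck/machinecheck0/tolerant
]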
diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86_64/mm.txt
index 133561b9cb0c..f42798ed1c54 100644
--- a/Documentation/x86_64/mm.txt
+++ b/Documentation/x86_64/mm.txt
@@ -3,26 +3,26 @@
 
 Virtual memory map with 4 level page tables:
 
-0000000000000000 - 00007fffffffffff (=47bits) user space, different per mm
+0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
 hole caused by [48:63] sign extension
-ffff800000000000 - ffff80ffffffffff (=40bits) guard hole
-ffff810000000000 - ffffc0ffffffffff (=46bits) direct mapping of all phys. memory
-ffffc10000000000 - ffffc1ffffffffff (=40bits) hole
-ffffc20000000000 - ffffe1ffffffffff (=45bits) vmalloc/ioremap space
+ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
+ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory
+ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
+ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
 ... unused hole ...
-ffffffff80000000 - ffffffff82800000 (=40MB) kernel text mapping, from phys 0
+ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0
 ... unused hole ...
-ffffffff88000000 - fffffffffff00000 (=1919MB) module mapping space
+ffffffff88000000 - fffffffffff00000 (=1919 MB) module mapping space
 
-The direct mapping covers all memory in the system upto the highest
+The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
-holes)
+holes).
 
 vmalloc space is lazily synchronized into the different PML4 pages of
 the processes using the page fault handler, with init_level4_pgt as
 reference.
 
-Current X86-64 implementations only support 40 bit of address space,
-but we support upto 46bits. This expands into MBZ space in the page tables.
+Current X86-64 implementations only support 40 bits of address space,
+but we support up to 46 bits. This expands into MBZ space in the page tables.
 
 -Andi Kleen, Jul 2004
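[Editorial illustration of the [48:63] sign-extension hole mentioned above; a
small C sketch, not kernel code:

    /* An address is canonical iff bits 48-63 replicate bit 47: shifting
     * bit 47 up to the sign bit and arithmetically shifting back must
     * reproduce the original value. */
    static int is_canonical(unsigned long long addr)
    {
            return (unsigned long long)(((long long)(addr << 16)) >> 16) == addr;
    }

For example 00007fffffffffff and ffff800000000000 pass the check, while
0000800000000000 (inside the hole) does not.]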
diff --git a/MAINTAINERS b/MAINTAINERS
index 9e6c9ff0f543..b0fd71b3f66f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3779,6 +3779,7 @@ P:	Andi Kleen
 M:	ak@suse.de
 L:	discuss@x86-64.org
 W:	http://www.x86-64.org
+T:	quilt ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt-current
 S:	Maintained
 
 YAM DRIVER FOR AX.25
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 63d5e841caf5..595fb771366e 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -203,6 +203,15 @@ config PARAVIRT
 	  However, when run without a hypervisor the kernel is
 	  theoretically slower. If in doubt, say N.
 
+config VMI
+	bool "VMI Paravirt-ops support"
+	depends on PARAVIRT
+	default y
+	help
+	  VMI provides a paravirtualized interface to multiple hypervisors
+	  including VMware ESX server and Xen by connecting to a ROM module
+	  provided by the hypervisor.
+
 config ACPI_SRAT
 	bool
 	default y
@@ -1263,3 +1272,12 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	bool
 	default y
+
+config NO_IDLE_HZ
+	bool
+	depends on PARAVIRT
+	default y
+	help
+	  Switches the regular HZ timer off when the system is going idle.
+	  This helps a hypervisor detect that the Linux system is idle,
+	  reducing the overhead of idle systems.
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index 2aecfba4ac4f..b99c0e2a4e63 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -226,11 +226,6 @@ config X86_CMPXCHG
 	depends on !M386
 	default y
 
-config X86_XADD
-	bool
-	depends on !M386
-	default y
-
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || X86_GENERIC
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index f68cc6f215f8..458bc1611933 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -87,7 +87,7 @@ config DOUBLEFAULT
 
 config DEBUG_PARAVIRT
 	bool "Enable some paravirtualization debugging"
-	default y
+	default n
 	depends on PARAVIRT && DEBUG_KERNEL
 	help
 	  Currently deliberately clobbers regs which are allowed to be
diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index bb0c376b62b3..5ae1e0bc8fd7 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.20-rc3
-# Fri Jan 5 11:54:46 2007
+# Linux kernel version: 2.6.20-git8
+# Tue Feb 13 11:25:18 2007
 #
 CONFIG_X86_32=y
 CONFIG_GENERIC_TIME=y
@@ -10,6 +10,7 @@ CONFIG_STACKTRACE_SUPPORT=y
 CONFIG_SEMAPHORE_SLEEPERS=y
 CONFIG_X86=y
 CONFIG_MMU=y
+CONFIG_ZONE_DMA=y
 CONFIG_GENERIC_ISA_DMA=y
 CONFIG_GENERIC_IOMAP=y
 CONFIG_GENERIC_BUG=y
@@ -139,7 +140,6 @@ CONFIG_MPENTIUMIII=y
 # CONFIG_MVIAC3_2 is not set
 CONFIG_X86_GENERIC=y
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_XADD=y
 CONFIG_X86_L1_CACHE_SHIFT=7
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 # CONFIG_ARCH_HAS_ILOG2_U32 is not set
@@ -198,6 +198,7 @@ CONFIG_FLAT_NODE_MEM_MAP=y
 # CONFIG_SPARSEMEM_STATIC is not set
 CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_RESOURCES_64BIT=y
+CONFIG_ZONE_DMA_FLAG=1
 # CONFIG_HIGHPTE is not set
 # CONFIG_MATH_EMULATION is not set
 CONFIG_MTRR=y
@@ -211,6 +212,7 @@ CONFIG_HZ_250=y
 CONFIG_HZ=250
 # CONFIG_KEXEC is not set
 # CONFIG_CRASH_DUMP is not set
+CONFIG_PHYSICAL_START=0x100000
 # CONFIG_RELOCATABLE is not set
 CONFIG_PHYSICAL_ALIGN=0x100000
 # CONFIG_HOTPLUG_CPU is not set
@@ -229,13 +231,14 @@ CONFIG_PM_SYSFS_DEPRECATED=y
 # ACPI (Advanced Configuration and Power Interface) Support
 #
 CONFIG_ACPI=y
+CONFIG_ACPI_PROCFS=y
 CONFIG_ACPI_AC=y
 CONFIG_ACPI_BATTERY=y
 CONFIG_ACPI_BUTTON=y
-# CONFIG_ACPI_VIDEO is not set
 # CONFIG_ACPI_HOTKEY is not set
 CONFIG_ACPI_FAN=y
 # CONFIG_ACPI_DOCK is not set
+# CONFIG_ACPI_BAY is not set
 CONFIG_ACPI_PROCESSOR=y
 CONFIG_ACPI_THERMAL=y
 # CONFIG_ACPI_ASUS is not set
@@ -306,7 +309,6 @@ CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
 # CONFIG_PCIEPORTBUS is not set
 CONFIG_PCI_MSI=y
-# CONFIG_PCI_MULTITHREAD_PROBE is not set
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_HT_IRQ is not set
 CONFIG_ISA_DMA_API=y
@@ -347,6 +349,7 @@ CONFIG_UNIX=y
 CONFIG_XFRM=y
 # CONFIG_XFRM_USER is not set
 # CONFIG_XFRM_SUB_POLICY is not set
+# CONFIG_XFRM_MIGRATE is not set
 # CONFIG_NET_KEY is not set
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
@@ -446,6 +449,7 @@ CONFIG_STANDALONE=y
 CONFIG_PREVENT_FIRMWARE_BUILD=y
 CONFIG_FW_LOADER=y
 # CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
 # CONFIG_SYS_HYPERVISOR is not set
 
 #
@@ -466,8 +470,7 @@ CONFIG_FW_LOADER=y
 #
 # Plug and Play support
 #
-CONFIG_PNP=y
-CONFIG_PNPACPI=y
+# CONFIG_PNP is not set
 
 #
 # Block devices
@@ -515,6 +518,7 @@ CONFIG_BLK_DEV_IDECD=y
 # CONFIG_BLK_DEV_IDETAPE is not set
 # CONFIG_BLK_DEV_IDEFLOPPY is not set
 # CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_BLK_DEV_IDEACPI=y
 # CONFIG_IDE_TASK_IOCTL is not set
 
 #
@@ -547,6 +551,7 @@ CONFIG_BLK_DEV_AMD74XX=y
 # CONFIG_BLK_DEV_JMICRON is not set
 # CONFIG_BLK_DEV_SC1200 is not set
 CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_IT8213 is not set
 # CONFIG_BLK_DEV_IT821X is not set
 # CONFIG_BLK_DEV_NS87415 is not set
 # CONFIG_BLK_DEV_PDC202XX_OLD is not set
@@ -557,6 +562,7 @@ CONFIG_BLK_DEV_PIIX=y
 # CONFIG_BLK_DEV_SLC90E66 is not set
 # CONFIG_BLK_DEV_TRM290 is not set
 # CONFIG_BLK_DEV_VIA82CXXX is not set
+# CONFIG_BLK_DEV_TC86C001 is not set
 # CONFIG_IDE_ARM is not set
 CONFIG_BLK_DEV_IDEDMA=y
 # CONFIG_IDEDMA_IVB is not set
@@ -655,6 +661,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0
 # Serial ATA (prod) and Parallel ATA (experimental) drivers
 #
 CONFIG_ATA=y
+# CONFIG_ATA_NONSTANDARD is not set
 CONFIG_SATA_AHCI=y
 CONFIG_SATA_SVW=y
 CONFIG_ATA_PIIX=y
@@ -670,6 +677,7 @@ CONFIG_SATA_SIL=y
 # CONFIG_SATA_ULI is not set
 CONFIG_SATA_VIA=y
 # CONFIG_SATA_VITESSE is not set
+# CONFIG_SATA_INIC162X is not set
 CONFIG_SATA_INTEL_COMBINED=y
 # CONFIG_PATA_ALI is not set
 # CONFIG_PATA_AMD is not set
@@ -687,6 +695,7 @@ CONFIG_SATA_INTEL_COMBINED=y
 # CONFIG_PATA_HPT3X2N is not set
 # CONFIG_PATA_HPT3X3 is not set
 # CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
 # CONFIG_PATA_JMICRON is not set
 # CONFIG_PATA_TRIFLEX is not set
 # CONFIG_PATA_MARVELL is not set
@@ -739,9 +748,7 @@ CONFIG_IEEE1394=y
 # Subsystem Options
 #
 # CONFIG_IEEE1394_VERBOSEDEBUG is not set
-# CONFIG_IEEE1394_OUI_DB is not set
 # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
-# CONFIG_IEEE1394_EXPORT_FULL_API is not set
 
 #
 # Device Drivers
@@ -767,6 +774,11 @@ CONFIG_IEEE1394_RAWIO=y
 # CONFIG_I2O is not set
 
 #
+# Macintosh device drivers
+#
+# CONFIG_MAC_EMUMOUSEBTN is not set
+
+#
 # Network device support
 #
 CONFIG_NETDEVICES=y
@@ -833,6 +845,7 @@ CONFIG_8139TOO=y
 # CONFIG_SUNDANCE is not set
 # CONFIG_TLAN is not set
 # CONFIG_VIA_RHINE is not set
+# CONFIG_SC92031 is not set
 
 #
 # Ethernet (1000 Mbit)
@@ -855,11 +868,13 @@ CONFIG_SKY2=y
 CONFIG_TIGON3=y
 CONFIG_BNX2=y
 # CONFIG_QLA3XXX is not set
+# CONFIG_ATL1 is not set
 
 #
 # Ethernet (10000 Mbit)
 #
 # CONFIG_CHELSIO_T1 is not set
+# CONFIG_CHELSIO_T3 is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
 # CONFIG_MYRI10GE is not set
@@ -1090,6 +1105,7 @@ CONFIG_SOUND=y
 # Open Sound System
 #
 CONFIG_SOUND_PRIME=y
+CONFIG_OBSOLETE_OSS=y
 # CONFIG_SOUND_BT878 is not set
 # CONFIG_SOUND_ES1371 is not set
 CONFIG_SOUND_ICH=y
@@ -1103,6 +1119,7 @@ CONFIG_SOUND_ICH=y
 # HID Devices
 #
 CONFIG_HID=y
+# CONFIG_HID_DEBUG is not set
 
 #
 # USB support
@@ -1117,10 +1134,8 @@ CONFIG_USB=y
 # Miscellaneous USB options
 #
 CONFIG_USB_DEVICEFS=y
-# CONFIG_USB_BANDWIDTH is not set
 # CONFIG_USB_DYNAMIC_MINORS is not set
 # CONFIG_USB_SUSPEND is not set
-# CONFIG_USB_MULTITHREAD_PROBE is not set
 # CONFIG_USB_OTG is not set
 
 #
@@ -1130,9 +1145,11 @@ CONFIG_USB_EHCI_HCD=y
 # CONFIG_USB_EHCI_SPLIT_ISO is not set
 # CONFIG_USB_EHCI_ROOT_HUB_TT is not set
 # CONFIG_USB_EHCI_TT_NEWSCHED is not set
+# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
 # CONFIG_USB_ISP116X_HCD is not set
 CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
 CONFIG_USB_OHCI_LITTLE_ENDIAN=y
 CONFIG_USB_UHCI_HCD=y
 # CONFIG_USB_SL811_HCD is not set
@@ -1183,6 +1200,7 @@ CONFIG_USB_HID=y
 # CONFIG_USB_ATI_REMOTE2 is not set
 # CONFIG_USB_KEYSPAN_REMOTE is not set
 # CONFIG_USB_APPLETOUCH is not set
+# CONFIG_USB_GTCO is not set
 
 #
 # USB Imaging devices
@@ -1288,6 +1306,10 @@ CONFIG_USB_MON=y
 #
 
 #
+# Auxiliary Display support
+#
+
+#
 # Virtualization
 #
 # CONFIG_KVM is not set
@@ -1480,6 +1502,7 @@ CONFIG_UNUSED_SYMBOLS=y
 # CONFIG_DEBUG_FS is not set
 # CONFIG_HEADERS_CHECK is not set
 CONFIG_DEBUG_KERNEL=y
+# CONFIG_DEBUG_SHIRQ is not set
 CONFIG_LOG_BUF_SHIFT=18
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_SCHEDSTATS is not set
@@ -1488,7 +1511,6 @@ CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_RT_MUTEX_TESTER is not set
 # CONFIG_DEBUG_SPINLOCK is not set
 # CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_RWSEMS is not set
 # CONFIG_DEBUG_LOCK_ALLOC is not set
 # CONFIG_PROVE_LOCKING is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
@@ -1533,7 +1555,8 @@ CONFIG_CRC32=y
 # CONFIG_LIBCRC32C is not set
 CONFIG_ZLIB_INFLATE=y
 CONFIG_PLIST=y
-CONFIG_IOMAP_COPY=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1e8988e558c5..cbe4e601885c 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -40,8 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
-# Make sure this is linked after any other paravirt_ops structs: see head.S
+obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
+obj-y				+= pcspeaker.o
 
 EXTRA_AFLAGS   := -traditional
 
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 776d9be26af9..f4159e0a7ae9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -36,6 +36,7 @@
 #include <asm/hpet.h>
 #include <asm/i8253.h>
 #include <asm/nmi.h>
+#include <asm/idle.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -1255,6 +1256,7 @@ fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
 	 * Besides, if we don't timer interrupts ignore the global
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
+	exit_idle();
 	irq_enter();
 	smp_local_timer_interrupt();
 	irq_exit();
@@ -1305,6 +1307,7 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs)
 {
 	unsigned long v;
 
+	exit_idle();
 	irq_enter();
 	/*
 	 * Check if this really is a spurious interrupt and ACK it
@@ -1329,6 +1332,7 @@ fastcall void smp_error_interrupt(struct pt_regs *regs)
 {
 	unsigned long v, v1;
 
+	exit_idle();
 	irq_enter();
 	/* First tickle the hardware, only then report what went on. -- REW */
 	v = apic_read(APIC_ESR);
@@ -1395,7 +1399,7 @@ int __init APIC_init_uniprocessor (void)
 	if (!skip_ioapic_setup && nr_ioapics)
 		setup_IO_APIC();
 #endif
-	setup_boot_APIC_clock();
+	setup_boot_clock();
 
 	return 0;
 }
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index db99a8948dae..f9ba0af7ee1f 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -211,6 +211,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/miscdevice.h>
 #include <linux/apm_bios.h>
 #include <linux/init.h>
@@ -1636,9 +1637,8 @@ static int do_open(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static int apm_get_info(char *buf, char **start, off_t fpos, int length)
+static int proc_apm_show(struct seq_file *m, void *v)
 {
-	char *p;
 	unsigned short bx;
 	unsigned short cx;
 	unsigned short dx;
@@ -1650,8 +1650,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
 	int time_units = -1;
 	char *units = "?";
 
-	p = buf;
-
 	if ((num_online_cpus() == 1) &&
 	    !(error = apm_get_power_status(&bx, &cx, &dx))) {
 		ac_line_status = (bx >> 8) & 0xff;
@@ -1705,7 +1703,7 @@
 	      -1: Unknown
 	   8) min = minutes; sec = seconds */
 
-	p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+	seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
 		     driver_version,
 		     (apm_info.bios.version >> 8) & 0xff,
 		     apm_info.bios.version & 0xff,
@@ -1716,10 +1714,22 @@
 		     percentage,
 		     time_units,
 		     units);
+	return 0;
+}
 
-	return p - buf;
+static int proc_apm_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_apm_show, NULL);
 }
 
+static const struct file_operations apm_file_ops = {
+	.owner		= THIS_MODULE,
+	.open		= proc_apm_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int apm(void *unused)
 {
 	unsigned short bx;
@@ -2341,9 +2351,9 @@ static int __init apm_init(void)
 		set_base(gdt[APM_DS >> 3],
 			 __va((unsigned long)apm_info.bios.dseg << 4));
 
-	apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
+	apm_proc = create_proc_entry("apm", 0, NULL);
 	if (apm_proc)
-		apm_proc->owner = THIS_MODULE;
+		apm_proc->proc_fops = &apm_file_ops;
 
 	kapmd_task = kthread_create(apm, NULL, "kapmd");
 	if (IS_ERR(kapmd_task)) {
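[Editorial note: the conversion above is the generic single_open() seq_file
pattern; a minimal self-contained sketch with invented names looks like this:

    #include <linux/seq_file.h>
    #include <linux/fs.h>

    /* show() emits the whole file in one pass via seq_printf() */
    static int example_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "hello %d\n", 42);
            return 0;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
            return single_open(file, example_show, NULL);
    }

single_open() stores the show callback, and seq_read()/seq_lseek()/
single_release() in the file_operations handle buffering, which is why
apm_get_info()'s manual pointer arithmetic could simply be deleted.]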
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 1b2f3cd33270..c37535163bfc 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -72,7 +72,7 @@ void foo(void)
 	OFFSET(PT_EAX, pt_regs, eax);
 	OFFSET(PT_DS,  pt_regs, xds);
 	OFFSET(PT_ES,  pt_regs, xes);
-	OFFSET(PT_GS,  pt_regs, xgs);
+	OFFSET(PT_FS,  pt_regs, xfs);
 	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
 	OFFSET(PT_EIP, pt_regs, eip);
 	OFFSET(PT_CS,  pt_regs, xcs);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 8a8bbdaaf38a..dcbbd0a8bfc2 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -605,7 +605,7 @@ void __init early_cpu_init(void)
 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
-	regs->xgs = __KERNEL_PDA;
+	regs->xfs = __KERNEL_PDA;
 	return regs;
 }
 
@@ -662,12 +662,12 @@ struct i386_pda boot_pda = {
 	.pcurrent = &init_task,
 };
 
-static inline void set_kernel_gs(void)
+static inline void set_kernel_fs(void)
 {
-	/* Set %gs for this CPU's PDA.  Memory clobber is to create a
+	/* Set %fs for this CPU's PDA.  Memory clobber is to create a
 	   barrier with respect to any PDA operations, so the compiler
 	   doesn't move any before here. */
-	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
 }
 
 /* Initialize the CPU's GDT and PDA.  The boot CPU does this for
@@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu)
 	   the boot CPU, this will transition from the boot gdt+pda to
 	   the real ones). */
 	load_gdt(cpu_gdt_descr);
-	set_kernel_gs();
+	set_kernel_fs();
 }
 
 /* Common CPU init for both boot and secondary CPUs */
@@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-	/* Clear %fs. */
-	asm volatile ("mov %0, %%fs" : : "r" (0));
+	/* Clear %gs. */
+	asm volatile ("mov %0, %%gs" : : "r" (0));
 
 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index c0c3b59de32c..de27bd07bc9c 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -6,6 +6,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/pci-direct.h>
 
 #include "cpu.h"
 
@@ -161,19 +162,19 @@ static void __cpuinit set_cx86_inc(void)
 static void __cpuinit geode_configure(void)
 {
 	unsigned long flags;
-	u8 ccr3, ccr4;
+	u8 ccr3;
 	local_irq_save(flags);
 
 	/* Suspend on halt power saving and enable #SUSP pin */
 	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
 
 	ccr3 = getCx86(CX86_CCR3);
-	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);	/* Enable */
-
-	ccr4 = getCx86(CX86_CCR4);
-	ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */
+	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);	/* enable MAPEN */
 
-	setCx86(CX86_CCR3, ccr3);
+
+	/* FPU fast, DTE cache, Mem bypass */
+	setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
+	setCx86(CX86_CCR3, ccr3);	/* disable MAPEN */
 
 	set_cx86_memwb();
 	set_cx86_reorder();
@@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void)
 }
 
 
-#ifdef CONFIG_PCI
-static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
-	{ },
-};
-#endif
-
 static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 {
 	unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
@@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 
 	case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
 #ifdef CONFIG_PCI
+	{
+		u32 vendor, device;
 		/* It isn't really a PCI quirk directly, but the cure is the
 		   same. The MediaGX has deep magic SMM stuff that handles the
 		   SB emulation. It thows away the fifo on disable_dma() which
@@ -273,22 +268,34 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 		printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
 		isa_dma_bridge_buggy = 2;
 
+		/* We do this before the PCI layer is running. However we
+		   are safe here as we know the bridge must be a Cyrix
+		   companion and must be present */
+		vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
+		device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
 
 		/*
 		 * The 5510/5520 companion chips have a funky PIT.
 		 */
-		if (pci_dev_present(cyrix_55x0))
+		if (vendor == PCI_VENDOR_ID_CYRIX &&
+		    (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
 			pit_latch_buggy = 1;
+	}
 #endif
 	c->x86_cache_size=16;	/* Yep 16K integrated cache thats it */
 
 	/* GXm supports extended cpuid levels 'ala' AMD */
 	if (c->cpuid_level == 2) {
 		/* Enable cxMMX extensions (GX1 Datasheet 54) */
-		setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
+		setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
 
-		/* GXlv/GXm/GX1 */
-		if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63)
+		/*
+		 * GXm : 0x30 ... 0x5f  GXm  datasheet 51
+		 * GXlv: 0x6x           GXlv datasheet 54
+		 * ?   : 0x7x
+		 * GX1 : 0x8x           GX1  datasheet 56
+		 */
+		if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f))
 			geode_configure();
 		get_model_name(c);  /* get CPU marketing name */
 		return;
@@ -415,15 +422,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
 
 	if (dir0 == 5 || dir0 == 3)
 	{
-		unsigned char ccr3, ccr4;
+		unsigned char ccr3;
 		unsigned long flags;
 		printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
 		local_irq_save(flags);
 		ccr3 = getCx86(CX86_CCR3);
 		setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
-		ccr4 = getCx86(CX86_CCR4);
-		setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */
-		setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+		setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */
+		setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
 		local_irq_restore(flags);
 	}
 }
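[Editorial illustration of the Cyrix register-access convention the cleanups
above apply repeatedly; an invented helper, not part of the patch:

    /* CCR4 and higher configuration registers are only accessible while
     * MAPEN (the high nibble of CCR3) is set to 1; always restore CCR3. */
    static void example_set_ccr4_bits(unsigned char bits)
    {
            unsigned char ccr3 = getCx86(CX86_CCR3);

            setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
            setCx86(CX86_CCR4, getCx86(CX86_CCR4) | bits);
            setCx86(CX86_CCR3, ccr3);                 /* disable MAPEN */
    }
]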
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index d555bec0db99..4f10c62d180c 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -12,6 +12,7 @@
 
 #include <asm/processor.h>
 #include <asm/system.h>
+#include <asm/mce.h>
 
 #include "mce.h"
 
diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h
index 84fd4cf7d0fb..81fb6e2d35f3 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -1,4 +1,5 @@
 #include <linux/init.h>
+#include <asm/mce.h>
 
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
@@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c);
 /* Call the installed machine check handler for this CPU setup. */
 extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
 
-extern int mce_disabled;
 extern int nr_mce_banks;
 
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a46011..8359c19d3a23 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -12,6 +12,7 @@
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/idle.h>
15 16
16#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
17 18
@@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_therm
59 60
60fastcall void smp_thermal_interrupt(struct pt_regs *regs) 61fastcall void smp_thermal_interrupt(struct pt_regs *regs)
61{ 62{
63 exit_idle();
62 irq_enter(); 64 irq_enter();
63 vendor_thermal_interrupt(regs); 65 vendor_thermal_interrupt(regs);
64 irq_exit(); 66 irq_exit();
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index ee771f305f96..c7d8f1756745 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
211 default: 211 default:
212 return -ENOTTY; 212 return -ENOTTY;
213 case MTRRIOC_ADD_ENTRY: 213 case MTRRIOC_ADD_ENTRY:
214#ifdef CONFIG_COMPAT
215 case MTRRIOC32_ADD_ENTRY:
216#endif
214 if (!capable(CAP_SYS_ADMIN)) 217 if (!capable(CAP_SYS_ADMIN))
215 return -EPERM; 218 return -EPERM;
216 err = 219 err =
@@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
218 file, 0); 221 file, 0);
219 break; 222 break;
220 case MTRRIOC_SET_ENTRY: 223 case MTRRIOC_SET_ENTRY:
224#ifdef CONFIG_COMPAT
225 case MTRRIOC32_SET_ENTRY:
226#endif
221 if (!capable(CAP_SYS_ADMIN)) 227 if (!capable(CAP_SYS_ADMIN))
222 return -EPERM; 228 return -EPERM;
223 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); 229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
224 break; 230 break;
225 case MTRRIOC_DEL_ENTRY: 231 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT
233 case MTRRIOC32_DEL_ENTRY:
234#endif
226 if (!capable(CAP_SYS_ADMIN)) 235 if (!capable(CAP_SYS_ADMIN))
227 return -EPERM; 236 return -EPERM;
228 err = mtrr_file_del(sentry.base, sentry.size, file, 0); 237 err = mtrr_file_del(sentry.base, sentry.size, file, 0);
229 break; 238 break;
230 case MTRRIOC_KILL_ENTRY: 239 case MTRRIOC_KILL_ENTRY:
240#ifdef CONFIG_COMPAT
241 case MTRRIOC32_KILL_ENTRY:
242#endif
231 if (!capable(CAP_SYS_ADMIN)) 243 if (!capable(CAP_SYS_ADMIN))
232 return -EPERM; 244 return -EPERM;
233 err = mtrr_del(-1, sentry.base, sentry.size); 245 err = mtrr_del(-1, sentry.base, sentry.size);
234 break; 246 break;
235 case MTRRIOC_GET_ENTRY: 247 case MTRRIOC_GET_ENTRY:
248#ifdef CONFIG_COMPAT
249 case MTRRIOC32_GET_ENTRY:
250#endif
236 if (gentry.regnum >= num_var_ranges) 251 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 252 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); 253 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
249 264
250 break; 265 break;
251 case MTRRIOC_ADD_PAGE_ENTRY: 266 case MTRRIOC_ADD_PAGE_ENTRY:
267#ifdef CONFIG_COMPAT
268 case MTRRIOC32_ADD_PAGE_ENTRY:
269#endif
252 if (!capable(CAP_SYS_ADMIN)) 270 if (!capable(CAP_SYS_ADMIN))
253 return -EPERM; 271 return -EPERM;
254 err = 272 err =
@@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
256 file, 1); 274 file, 1);
257 break; 275 break;
258 case MTRRIOC_SET_PAGE_ENTRY: 276 case MTRRIOC_SET_PAGE_ENTRY:
277#ifdef CONFIG_COMPAT
278 case MTRRIOC32_SET_PAGE_ENTRY:
279#endif
259 if (!capable(CAP_SYS_ADMIN)) 280 if (!capable(CAP_SYS_ADMIN))
260 return -EPERM; 281 return -EPERM;
261 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); 282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
262 break; 283 break;
263 case MTRRIOC_DEL_PAGE_ENTRY: 284 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT
286 case MTRRIOC32_DEL_PAGE_ENTRY:
287#endif
264 if (!capable(CAP_SYS_ADMIN)) 288 if (!capable(CAP_SYS_ADMIN))
265 return -EPERM; 289 return -EPERM;
266 err = mtrr_file_del(sentry.base, sentry.size, file, 1); 290 err = mtrr_file_del(sentry.base, sentry.size, file, 1);
267 break; 291 break;
268 case MTRRIOC_KILL_PAGE_ENTRY: 292 case MTRRIOC_KILL_PAGE_ENTRY:
293#ifdef CONFIG_COMPAT
294 case MTRRIOC32_KILL_PAGE_ENTRY:
295#endif
269 if (!capable(CAP_SYS_ADMIN)) 296 if (!capable(CAP_SYS_ADMIN))
270 return -EPERM; 297 return -EPERM;
271 err = mtrr_del_page(-1, sentry.base, sentry.size); 298 err = mtrr_del_page(-1, sentry.base, sentry.size);
272 break; 299 break;
273 case MTRRIOC_GET_PAGE_ENTRY: 300 case MTRRIOC_GET_PAGE_ENTRY:
301#ifdef CONFIG_COMPAT
302 case MTRRIOC32_GET_PAGE_ENTRY:
303#endif
274 if (gentry.regnum >= num_var_ranges) 304 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 305 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); 306 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
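
The repeated #ifdef CONFIG_COMPAT case labels above exist because an ioctl
number encodes the size of its argument type, so a 32-bit process computes
a different command value for "the same" request than a 64-bit one. Letting
each MTRRIOC32_* value fall through to the native case reuses one handler
for both. An illustrative sketch (stand-in struct fields; not the kernel's
exact definitions):

    #include <linux/ioctl.h>

    /* assumed 32-bit userland layout of the entry argument */
    struct mtrr_sentry32 {
            unsigned int base;
            unsigned int size;
            unsigned int type;
    };

    #define MTRR_IOCTL_BASE 'M'
    /* sizeof(struct mtrr_sentry32) is baked into the command value, so
     * this number differs from the native MTRRIOC_ADD_ENTRY */
    #define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
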
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 16bb7ea87145..0acfb6a5a220 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -50,7 +50,7 @@ u32 num_var_ranges = 0;
50unsigned int *usage_table; 50unsigned int *usage_table;
51static DEFINE_MUTEX(mtrr_mutex); 51static DEFINE_MUTEX(mtrr_mutex);
52 52
53u32 size_or_mask, size_and_mask; 53u64 size_or_mask, size_and_mask;
54 54
55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; 55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
56 56
@@ -662,8 +662,8 @@ void __init mtrr_bp_init(void)
662 boot_cpu_data.x86_mask == 0x4)) 662 boot_cpu_data.x86_mask == 0x4))
663 phys_addr = 36; 663 phys_addr = 36;
664 664
665 size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); 665 size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
666 size_and_mask = ~size_or_mask & 0xfff00000; 666 size_and_mask = ~size_or_mask & 0xfffff00000ULL;
667 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && 667 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
668 boot_cpu_data.x86 == 6) { 668 boot_cpu_data.x86 == 6) {
669 /* VIA C* family have Intel style MTRRs, but 669 /* VIA C* family have Intel style MTRRs, but
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index d61ea9db6cfe..289dfe6030e3 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -84,7 +84,7 @@ void get_mtrr_state(void);
84 84
85extern void set_mtrr_ops(struct mtrr_ops * ops); 85extern void set_mtrr_ops(struct mtrr_ops * ops);
86 86
87extern u32 size_or_mask, size_and_mask; 87extern u64 size_or_mask, size_and_mask;
88extern struct mtrr_ops * mtrr_if; 88extern struct mtrr_ops * mtrr_if;
89 89
90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
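
The widening from u32 to u64 matters because the masks operate on
page-frame numbers derived from the CPU's physical address width. For an
assumed CPU reporting 40 physical bits (PAGE_SHIFT = 12, so 28-bit PFNs),
the old 32-bit arithmetic silently truncated every mask bit above 31; a
quick stand-alone check:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            const unsigned phys_addr = 40, page_shift = 12; /* assumed */

            uint32_t or32 = ~((1u   << (phys_addr - page_shift)) - 1);
            uint64_t or64 = ~((1ull << (phys_addr - page_shift)) - 1);

            printf("u32: %#010x\n", or32);  /* 0xf0000000: bits 32+ lost */
            printf("u64: %#018llx\n", (unsigned long long)or64);
                                            /* 0xfffffffff0000000 */
            return 0;
    }
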
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 6624d8583c42..47e3ebbfb28d 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, 30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, 31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
32 NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", 32 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow",
33 33
34 /* Transmeta-defined */ 34 /* Transmeta-defined */
35 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, 35 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
47 /* Intel-defined (#2) */ 47 /* Intel-defined (#2) */
48 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", 48 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
49 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, 49 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
50 NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, 50 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
51 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 51 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
52 52
53 /* VIA/Cyrix/Centaur-defined */ 53 /* VIA/Cyrix/Centaur-defined */
@@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
57 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 57 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
58 58
59 /* AMD-defined (#2) */ 59 /* AMD-defined (#2) */
60 "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL, 60 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm",
61 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 61 "sse4a", "misalignsse",
62 "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 64 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64 }; 65 };
@@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
69 "ttp", /* thermal trip */ 70 "ttp", /* thermal trip */
70 "tm", 71 "tm",
71 "stc", 72 "stc",
73 "100mhzsteps",
74 "hwpstate",
72 NULL, 75 NULL,
73 /* nothing */ /* constant_tsc - moved to flags */ 76 NULL, /* constant_tsc - moved to flags */
77 /* nothing */
74 }; 78 };
75 struct cpuinfo_x86 *c = v; 79 struct cpuinfo_x86 *c = v;
76 int i, n = c - cpu_data; 80 int i, n = c - cpu_data;
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 4056fb7d2cdf..5678d46863c6 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 9{
10 unsigned int cap_mask, uk, max, dummy; 10 unsigned int cap_mask, uk, max, dummy;
11 unsigned int cms_rev1, cms_rev2; 11 unsigned int cms_rev1, cms_rev2;
12 unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; 12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 13 char cpu_info[65];
14 14
15 get_model_name(c); /* Same as AMD/Cyrix */ 15 get_model_name(c); /* Same as AMD/Cyrix */
@@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
72 wrmsr(0x80860004, ~0, uk); 72 wrmsr(0x80860004, ~0, uk);
73 c->x86_capability[0] = cpuid_edx(0x00000001); 73 c->x86_capability[0] = cpuid_edx(0x00000001);
74 wrmsr(0x80860004, cap_mask, uk); 74 wrmsr(0x80860004, cap_mask, uk);
75
76 /* All Transmeta CPUs have a constant TSC */
77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
75 78
76 /* If we can run i686 user-space code, call us an i686 */ 79 /* If we can run i686 user-space code, call us an i686 */
77#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) 80#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 4da75fa3208d..eeae0d992337 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -48,7 +48,6 @@ static struct class *cpuid_class;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 49
50struct cpuid_command { 50struct cpuid_command {
51 int cpu;
52 u32 reg; 51 u32 reg;
53 u32 *data; 52 u32 *data;
54}; 53};
@@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block)
57{ 56{
58 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; 57 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
59 58
60 if (cmd->cpu == smp_processor_id()) 59 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
61 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
62 &cmd->data[3]); 60 &cmd->data[3]);
63} 61}
64 62
@@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data)
70 if (cpu == smp_processor_id()) { 68 if (cpu == smp_processor_id()) {
71 cpuid(reg, &data[0], &data[1], &data[2], &data[3]); 69 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
72 } else { 70 } else {
73 cmd.cpu = cpu;
74 cmd.reg = reg; 71 cmd.reg = reg;
75 cmd.data = data; 72 cmd.data = data;
76 73
77 smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1); 74 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
78 } 75 }
79 preempt_enable(); 76 preempt_enable();
80} 77}
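
The dropped 'cpu' field above follows from the calling-convention change:
smp_call_function() broadcast the IPI and made every CPU run the callback,
so each invocation had to filter on smp_processor_id(); with
smp_call_function_single() the kernel delivers the call to exactly one CPU
and the self-check becomes dead code. A sketch of the two shapes (names
from this file; a fragment, not compilable in isolation):

    /* before: broadcast, filter inside the callback */
    cmd.cpu = cpu;
    smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1);      /* all CPUs */

    /* after: targeted call; only 'cpu' ever runs cpuid_smp_cpuid() */
    smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
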
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index f391abcf7da9..70f39560846a 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -14,6 +14,7 @@
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/page.h> 15#include <asm/page.h>
16#include <asm/e820.h> 16#include <asm/e820.h>
17#include <asm/setup.h>
17 18
18#ifdef CONFIG_EFI 19#ifdef CONFIG_EFI
19int efi_enabled = 0; 20int efi_enabled = 0;
@@ -156,21 +157,22 @@ static struct resource standard_io_resources[] = { {
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO 157 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} }; 158} };
158 159
159static int romsignature(const unsigned char *x) 160#define ROMSIGNATURE 0xaa55
161
162static int __init romsignature(const unsigned char *rom)
160{ 163{
161 unsigned short sig; 164 unsigned short sig;
162 int ret = 0; 165
163 if (probe_kernel_address((const unsigned short *)x, sig) == 0) 166 return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
164 ret = (sig == 0xaa55); 167 sig == ROMSIGNATURE;
165 return ret;
166} 168}
167 169
168static int __init romchecksum(unsigned char *rom, unsigned long length) 170static int __init romchecksum(unsigned char *rom, unsigned long length)
169{ 171{
170 unsigned char *p, sum = 0; 172 unsigned char sum;
171 173
172 for (p = rom; p < rom + length; p++) 174 for (sum = 0; length; length--)
173 sum += *p; 175 sum += *rom++;
174 return sum == 0; 176 return sum == 0;
175} 177}
176 178
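
Both helpers above implement the classic option-ROM validity rules: the
image starts with the little-endian signature 0xaa55 (bytes 0x55, 0xAA),
and all of its bytes must sum to zero modulo 256. A stand-alone version of
the checksum rule:

    #include <stddef.h>

    /* valid when the byte sum wraps to zero; unsigned char gives the
     * mod-256 arithmetic for free */
    static int rom_checksum_ok(const unsigned char *rom, size_t length)
    {
            unsigned char sum = 0;

            while (length--)
                    sum += *rom++;
            return sum == 0;
    }
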
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683fc63a..18bddcb8e9e8 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,7 +30,7 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - %gs 33 * 24(%esp) - %fs
34 * 28(%esp) - orig_eax 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %eip 35 * 2C(%esp) - %eip
36 * 30(%esp) - %cs 36 * 30(%esp) - %cs
@@ -99,9 +99,9 @@ VM_MASK = 0x00020000
99 99
100#define SAVE_ALL \ 100#define SAVE_ALL \
101 cld; \ 101 cld; \
102 pushl %gs; \ 102 pushl %fs; \
103 CFI_ADJUST_CFA_OFFSET 4;\ 103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\ 104 /*CFI_REL_OFFSET fs, 0;*/\
105 pushl %es; \ 105 pushl %es; \
106 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
107 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -133,7 +133,7 @@ VM_MASK = 0x00020000
133 movl %edx, %ds; \ 133 movl %edx, %ds; \
134 movl %edx, %es; \ 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \ 135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs 136 movl %edx, %fs
137 137
138#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
139 popl %ebx; \ 139 popl %ebx; \
@@ -166,9 +166,9 @@ VM_MASK = 0x00020000
1662: popl %es; \ 1662: popl %es; \
167 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
168 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
1693: popl %gs; \ 1693: popl %fs; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 170 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE gs;*/\ 171 /*CFI_RESTORE fs;*/\
172.pushsection .fixup,"ax"; \ 172.pushsection .fixup,"ax"; \
1734: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \ 174 jmp 1b; \
@@ -227,6 +227,7 @@ ENTRY(ret_from_fork)
227 CFI_ADJUST_CFA_OFFSET -4 227 CFI_ADJUST_CFA_OFFSET -4
228 jmp syscall_exit 228 jmp syscall_exit
229 CFI_ENDPROC 229 CFI_ENDPROC
230END(ret_from_fork)
230 231
231/* 232/*
232 * Return to user mode is not as complex as all this looks, 233 * Return to user mode is not as complex as all this looks,
@@ -258,6 +259,7 @@ ENTRY(resume_userspace)
258 # int/exception return? 259 # int/exception return?
259 jne work_pending 260 jne work_pending
260 jmp restore_all 261 jmp restore_all
262END(ret_from_exception)
261 263
262#ifdef CONFIG_PREEMPT 264#ifdef CONFIG_PREEMPT
263ENTRY(resume_kernel) 265ENTRY(resume_kernel)
@@ -272,6 +274,7 @@ need_resched:
272 jz restore_all 274 jz restore_all
273 call preempt_schedule_irq 275 call preempt_schedule_irq
274 jmp need_resched 276 jmp need_resched
277END(resume_kernel)
275#endif 278#endif
276 CFI_ENDPROC 279 CFI_ENDPROC
277 280
@@ -349,16 +352,17 @@ sysenter_past_esp:
349 movl PT_OLDESP(%esp), %ecx 352 movl PT_OLDESP(%esp), %ecx
350 xorl %ebp,%ebp 353 xorl %ebp,%ebp
351 TRACE_IRQS_ON 354 TRACE_IRQS_ON
3521: mov PT_GS(%esp), %gs 3551: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 356 ENABLE_INTERRUPTS_SYSEXIT
354 CFI_ENDPROC 357 CFI_ENDPROC
355.pushsection .fixup,"ax" 358.pushsection .fixup,"ax"
3562: movl $0,PT_GS(%esp) 3592: movl $0,PT_FS(%esp)
357 jmp 1b 360 jmp 1b
358.section __ex_table,"a" 361.section __ex_table,"a"
359 .align 4 362 .align 4
360 .long 1b,2b 363 .long 1b,2b
361.popsection 364.popsection
365ENDPROC(sysenter_entry)
362 366
363 # system call handler stub 367 # system call handler stub
364ENTRY(system_call) 368ENTRY(system_call)
@@ -459,6 +463,7 @@ ldt_ss:
459 CFI_ADJUST_CFA_OFFSET -8 463 CFI_ADJUST_CFA_OFFSET -8
460 jmp restore_nocheck 464 jmp restore_nocheck
461 CFI_ENDPROC 465 CFI_ENDPROC
466ENDPROC(system_call)
462 467
463 # perform work that needs to be done immediately before resumption 468 # perform work that needs to be done immediately before resumption
464 ALIGN 469 ALIGN
@@ -504,6 +509,7 @@ work_notifysig_v86:
504 xorl %edx, %edx 509 xorl %edx, %edx
505 call do_notify_resume 510 call do_notify_resume
506 jmp resume_userspace_sig 511 jmp resume_userspace_sig
512END(work_pending)
507 513
508 # perform syscall exit tracing 514 # perform syscall exit tracing
509 ALIGN 515 ALIGN
@@ -519,6 +525,7 @@ syscall_trace_entry:
519 cmpl $(nr_syscalls), %eax 525 cmpl $(nr_syscalls), %eax
520 jnae syscall_call 526 jnae syscall_call
521 jmp syscall_exit 527 jmp syscall_exit
528END(syscall_trace_entry)
522 529
523 # perform syscall exit tracing 530 # perform syscall exit tracing
524 ALIGN 531 ALIGN
@@ -532,6 +539,7 @@ syscall_exit_work:
532 movl $1, %edx 539 movl $1, %edx
533 call do_syscall_trace 540 call do_syscall_trace
534 jmp resume_userspace 541 jmp resume_userspace
542END(syscall_exit_work)
535 CFI_ENDPROC 543 CFI_ENDPROC
536 544
537 RING0_INT_FRAME # can't unwind into user space anyway 545 RING0_INT_FRAME # can't unwind into user space anyway
@@ -542,15 +550,17 @@ syscall_fault:
542 GET_THREAD_INFO(%ebp) 550 GET_THREAD_INFO(%ebp)
543 movl $-EFAULT,PT_EAX(%esp) 551 movl $-EFAULT,PT_EAX(%esp)
544 jmp resume_userspace 552 jmp resume_userspace
553END(syscall_fault)
545 554
546syscall_badsys: 555syscall_badsys:
547 movl $-ENOSYS,PT_EAX(%esp) 556 movl $-ENOSYS,PT_EAX(%esp)
548 jmp resume_userspace 557 jmp resume_userspace
558END(syscall_badsys)
549 CFI_ENDPROC 559 CFI_ENDPROC
550 560
551#define FIXUP_ESPFIX_STACK \ 561#define FIXUP_ESPFIX_STACK \
552 /* since we are on the wrong stack, we can't do this in C :( */ \ 562 /* since we are on the wrong stack, we can't do this in C :( */ \
553 movl %gs:PDA_cpu, %ebx; \ 563 movl %fs:PDA_cpu, %ebx; \
554 PER_CPU(cpu_gdt_descr, %ebx); \ 564 PER_CPU(cpu_gdt_descr, %ebx); \
555 movl GDS_address(%ebx), %ebx; \ 565 movl GDS_address(%ebx), %ebx; \
556 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 566 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
@@ -581,9 +591,9 @@ syscall_badsys:
581ENTRY(interrupt) 591ENTRY(interrupt)
582.text 592.text
583 593
584vector=0
585ENTRY(irq_entries_start) 594ENTRY(irq_entries_start)
586 RING0_INT_FRAME 595 RING0_INT_FRAME
596vector=0
587.rept NR_IRQS 597.rept NR_IRQS
588 ALIGN 598 ALIGN
589 .if vector 599 .if vector
@@ -592,11 +602,16 @@ ENTRY(irq_entries_start)
5921: pushl $~(vector) 6021: pushl $~(vector)
593 CFI_ADJUST_CFA_OFFSET 4 603 CFI_ADJUST_CFA_OFFSET 4
594 jmp common_interrupt 604 jmp common_interrupt
595.data 605 .previous
596 .long 1b 606 .long 1b
597.text 607 .text
598vector=vector+1 608vector=vector+1
599.endr 609.endr
610END(irq_entries_start)
611
612.previous
613END(interrupt)
614.previous
600 615
601/* 616/*
602 * the CPU automatically disables interrupts when executing an IRQ vector, 617 * the CPU automatically disables interrupts when executing an IRQ vector,
@@ -609,6 +624,7 @@ common_interrupt:
609 movl %esp,%eax 624 movl %esp,%eax
610 call do_IRQ 625 call do_IRQ
611 jmp ret_from_intr 626 jmp ret_from_intr
627ENDPROC(common_interrupt)
612 CFI_ENDPROC 628 CFI_ENDPROC
613 629
614#define BUILD_INTERRUPT(name, nr) \ 630#define BUILD_INTERRUPT(name, nr) \
@@ -621,18 +637,24 @@ ENTRY(name) \
621 movl %esp,%eax; \ 637 movl %esp,%eax; \
622 call smp_/**/name; \ 638 call smp_/**/name; \
623 jmp ret_from_intr; \ 639 jmp ret_from_intr; \
624 CFI_ENDPROC 640 CFI_ENDPROC; \
641ENDPROC(name)
625 642
626/* The include is where all of the SMP etc. interrupts come from */ 643/* The include is where all of the SMP etc. interrupts come from */
627#include "entry_arch.h" 644#include "entry_arch.h"
628 645
646/* This alternate entry is needed because we hijack the apic LVTT */
647#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
648BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
649#endif
650
629KPROBE_ENTRY(page_fault) 651KPROBE_ENTRY(page_fault)
630 RING0_EC_FRAME 652 RING0_EC_FRAME
631 pushl $do_page_fault 653 pushl $do_page_fault
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 ALIGN 655 ALIGN
634error_code: 656error_code:
635 /* the function address is in %gs's slot on the stack */ 657 /* the function address is in %fs's slot on the stack */
636 pushl %es 658 pushl %es
637 CFI_ADJUST_CFA_OFFSET 4 659 CFI_ADJUST_CFA_OFFSET 4
638 /*CFI_REL_OFFSET es, 0*/ 660 /*CFI_REL_OFFSET es, 0*/
@@ -661,20 +683,20 @@ error_code:
661 CFI_ADJUST_CFA_OFFSET 4 683 CFI_ADJUST_CFA_OFFSET 4
662 CFI_REL_OFFSET ebx, 0 684 CFI_REL_OFFSET ebx, 0
663 cld 685 cld
664 pushl %gs 686 pushl %fs
665 CFI_ADJUST_CFA_OFFSET 4 687 CFI_ADJUST_CFA_OFFSET 4
666 /*CFI_REL_OFFSET gs, 0*/ 688 /*CFI_REL_OFFSET fs, 0*/
667 movl $(__KERNEL_PDA), %ecx 689 movl $(__KERNEL_PDA), %ecx
668 movl %ecx, %gs 690 movl %ecx, %fs
669 UNWIND_ESPFIX_STACK 691 UNWIND_ESPFIX_STACK
670 popl %ecx 692 popl %ecx
671 CFI_ADJUST_CFA_OFFSET -4 693 CFI_ADJUST_CFA_OFFSET -4
672 /*CFI_REGISTER es, ecx*/ 694 /*CFI_REGISTER es, ecx*/
673 movl PT_GS(%esp), %edi # get the function address 695 movl PT_FS(%esp), %edi # get the function address
674 movl PT_ORIG_EAX(%esp), %edx # get the error code 696 movl PT_ORIG_EAX(%esp), %edx # get the error code
675 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart 697 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
676 mov %ecx, PT_GS(%esp) 698 mov %ecx, PT_FS(%esp)
677 /*CFI_REL_OFFSET gs, ES*/ 699 /*CFI_REL_OFFSET fs, ES*/
678 movl $(__USER_DS), %ecx 700 movl $(__USER_DS), %ecx
679 movl %ecx, %ds 701 movl %ecx, %ds
680 movl %ecx, %es 702 movl %ecx, %es
@@ -692,6 +714,7 @@ ENTRY(coprocessor_error)
692 CFI_ADJUST_CFA_OFFSET 4 714 CFI_ADJUST_CFA_OFFSET 4
693 jmp error_code 715 jmp error_code
694 CFI_ENDPROC 716 CFI_ENDPROC
717END(coprocessor_error)
695 718
696ENTRY(simd_coprocessor_error) 719ENTRY(simd_coprocessor_error)
697 RING0_INT_FRAME 720 RING0_INT_FRAME
@@ -701,6 +724,7 @@ ENTRY(simd_coprocessor_error)
701 CFI_ADJUST_CFA_OFFSET 4 724 CFI_ADJUST_CFA_OFFSET 4
702 jmp error_code 725 jmp error_code
703 CFI_ENDPROC 726 CFI_ENDPROC
727END(simd_coprocessor_error)
704 728
705ENTRY(device_not_available) 729ENTRY(device_not_available)
706 RING0_INT_FRAME 730 RING0_INT_FRAME
@@ -721,6 +745,7 @@ device_not_available_emulate:
721 CFI_ADJUST_CFA_OFFSET -4 745 CFI_ADJUST_CFA_OFFSET -4
722 jmp ret_from_exception 746 jmp ret_from_exception
723 CFI_ENDPROC 747 CFI_ENDPROC
748END(device_not_available)
724 749
725/* 750/*
726 * Debug traps and NMI can happen at the one SYSENTER instruction 751 * Debug traps and NMI can happen at the one SYSENTER instruction
@@ -864,10 +889,12 @@ ENTRY(native_iret)
864 .align 4 889 .align 4
865 .long 1b,iret_exc 890 .long 1b,iret_exc
866.previous 891.previous
892END(native_iret)
867 893
868ENTRY(native_irq_enable_sysexit) 894ENTRY(native_irq_enable_sysexit)
869 sti 895 sti
870 sysexit 896 sysexit
897END(native_irq_enable_sysexit)
871#endif 898#endif
872 899
873KPROBE_ENTRY(int3) 900KPROBE_ENTRY(int3)
@@ -890,6 +917,7 @@ ENTRY(overflow)
890 CFI_ADJUST_CFA_OFFSET 4 917 CFI_ADJUST_CFA_OFFSET 4
891 jmp error_code 918 jmp error_code
892 CFI_ENDPROC 919 CFI_ENDPROC
920END(overflow)
893 921
894ENTRY(bounds) 922ENTRY(bounds)
895 RING0_INT_FRAME 923 RING0_INT_FRAME
@@ -899,6 +927,7 @@ ENTRY(bounds)
899 CFI_ADJUST_CFA_OFFSET 4 927 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 928 jmp error_code
901 CFI_ENDPROC 929 CFI_ENDPROC
930END(bounds)
902 931
903ENTRY(invalid_op) 932ENTRY(invalid_op)
904 RING0_INT_FRAME 933 RING0_INT_FRAME
@@ -908,6 +937,7 @@ ENTRY(invalid_op)
908 CFI_ADJUST_CFA_OFFSET 4 937 CFI_ADJUST_CFA_OFFSET 4
909 jmp error_code 938 jmp error_code
910 CFI_ENDPROC 939 CFI_ENDPROC
940END(invalid_op)
911 941
912ENTRY(coprocessor_segment_overrun) 942ENTRY(coprocessor_segment_overrun)
913 RING0_INT_FRAME 943 RING0_INT_FRAME
@@ -917,6 +947,7 @@ ENTRY(coprocessor_segment_overrun)
917 CFI_ADJUST_CFA_OFFSET 4 947 CFI_ADJUST_CFA_OFFSET 4
918 jmp error_code 948 jmp error_code
919 CFI_ENDPROC 949 CFI_ENDPROC
950END(coprocessor_segment_overrun)
920 951
921ENTRY(invalid_TSS) 952ENTRY(invalid_TSS)
922 RING0_EC_FRAME 953 RING0_EC_FRAME
@@ -924,6 +955,7 @@ ENTRY(invalid_TSS)
924 CFI_ADJUST_CFA_OFFSET 4 955 CFI_ADJUST_CFA_OFFSET 4
925 jmp error_code 956 jmp error_code
926 CFI_ENDPROC 957 CFI_ENDPROC
958END(invalid_TSS)
927 959
928ENTRY(segment_not_present) 960ENTRY(segment_not_present)
929 RING0_EC_FRAME 961 RING0_EC_FRAME
@@ -931,6 +963,7 @@ ENTRY(segment_not_present)
931 CFI_ADJUST_CFA_OFFSET 4 963 CFI_ADJUST_CFA_OFFSET 4
932 jmp error_code 964 jmp error_code
933 CFI_ENDPROC 965 CFI_ENDPROC
966END(segment_not_present)
934 967
935ENTRY(stack_segment) 968ENTRY(stack_segment)
936 RING0_EC_FRAME 969 RING0_EC_FRAME
@@ -938,6 +971,7 @@ ENTRY(stack_segment)
938 CFI_ADJUST_CFA_OFFSET 4 971 CFI_ADJUST_CFA_OFFSET 4
939 jmp error_code 972 jmp error_code
940 CFI_ENDPROC 973 CFI_ENDPROC
974END(stack_segment)
941 975
942KPROBE_ENTRY(general_protection) 976KPROBE_ENTRY(general_protection)
943 RING0_EC_FRAME 977 RING0_EC_FRAME
@@ -953,6 +987,7 @@ ENTRY(alignment_check)
953 CFI_ADJUST_CFA_OFFSET 4 987 CFI_ADJUST_CFA_OFFSET 4
954 jmp error_code 988 jmp error_code
955 CFI_ENDPROC 989 CFI_ENDPROC
990END(alignment_check)
956 991
957ENTRY(divide_error) 992ENTRY(divide_error)
958 RING0_INT_FRAME 993 RING0_INT_FRAME
@@ -962,6 +997,7 @@ ENTRY(divide_error)
962 CFI_ADJUST_CFA_OFFSET 4 997 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 998 jmp error_code
964 CFI_ENDPROC 999 CFI_ENDPROC
1000END(divide_error)
965 1001
966#ifdef CONFIG_X86_MCE 1002#ifdef CONFIG_X86_MCE
967ENTRY(machine_check) 1003ENTRY(machine_check)
@@ -972,6 +1008,7 @@ ENTRY(machine_check)
972 CFI_ADJUST_CFA_OFFSET 4 1008 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 1009 jmp error_code
974 CFI_ENDPROC 1010 CFI_ENDPROC
1011END(machine_check)
975#endif 1012#endif
976 1013
977ENTRY(spurious_interrupt_bug) 1014ENTRY(spurious_interrupt_bug)
@@ -982,6 +1019,7 @@ ENTRY(spurious_interrupt_bug)
982 CFI_ADJUST_CFA_OFFSET 4 1019 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 1020 jmp error_code
984 CFI_ENDPROC 1021 CFI_ENDPROC
1022END(spurious_interrupt_bug)
985 1023
986ENTRY(kernel_thread_helper) 1024ENTRY(kernel_thread_helper)
987 pushl $0 # fake return address for unwinder 1025 pushl $0 # fake return address for unwinder
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index cb9abdfced9b..3fa7f9389afe 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -53,6 +53,7 @@
53 * any particular GDT layout, because we load our own as soon as we 53 * any particular GDT layout, because we load our own as soon as we
54 * can. 54 * can.
55 */ 55 */
56.section .text.head,"ax",@progbits
56ENTRY(startup_32) 57ENTRY(startup_32)
57 58
58#ifdef CONFIG_PARAVIRT 59#ifdef CONFIG_PARAVIRT
@@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
141 jb 10b 142 jb 10b
142 movl %edi,(init_pg_tables_end - __PAGE_OFFSET) 143 movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
143 144
144#ifdef CONFIG_SMP
145 xorl %ebx,%ebx /* This is the boot CPU (BSP) */ 145 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
146 jmp 3f 146 jmp 3f
147
148/* 147/*
149 * Non-boot CPU entry point; entered from trampoline.S 148 * Non-boot CPU entry point; entered from trampoline.S
150 * We can't lgdt here, because lgdt itself uses a data segment, but 149 * We can't lgdt here, because lgdt itself uses a data segment, but
151 * we know the trampoline has already loaded the boot_gdt_table GDT 150 * we know the trampoline has already loaded the boot_gdt_table GDT
152 * for us. 151 * for us.
152 *
153 * If CPU hotplug is not supported, this code can go in the init
154 * section, which will be freed later.
153 */ 155 */
156
157#ifdef CONFIG_HOTPLUG_CPU
158.section .text,"ax",@progbits
159#else
160.section .init.text,"ax",@progbits
161#endif
162
163#ifdef CONFIG_SMP
154ENTRY(startup_32_smp) 164ENTRY(startup_32_smp)
155 cld 165 cld
156 movl $(__BOOT_DS),%eax 166 movl $(__BOOT_DS),%eax
@@ -208,8 +218,8 @@ ENTRY(startup_32_smp)
208 xorl %ebx,%ebx 218 xorl %ebx,%ebx
209 incl %ebx 219 incl %ebx
210 220
2113:
212#endif /* CONFIG_SMP */ 221#endif /* CONFIG_SMP */
2223:
213 223
214/* 224/*
215 * Enable paging 225 * Enable paging
@@ -309,7 +319,7 @@ is386: movl $2,%ecx # set MP
309 319
310 call check_x87 320 call check_x87
311 call setup_pda 321 call setup_pda
312 lgdt cpu_gdt_descr 322 lgdt early_gdt_descr
313 lidt idt_descr 323 lidt idt_descr
314 ljmp $(__KERNEL_CS),$1f 324 ljmp $(__KERNEL_CS),$1f
3151: movl $(__KERNEL_DS),%eax # reload all the segment registers 3251: movl $(__KERNEL_DS),%eax # reload all the segment registers
@@ -319,12 +329,12 @@ is386: movl $2,%ecx # set MP
319 movl %eax,%ds 329 movl %eax,%ds
320 movl %eax,%es 330 movl %eax,%es
321 331
322 xorl %eax,%eax # Clear FS and LDT 332 xorl %eax,%eax # Clear GS and LDT
323 movl %eax,%fs 333 movl %eax,%gs
324 lldt %ax 334 lldt %ax
325 335
326 movl $(__KERNEL_PDA),%eax 336 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs 337 mov %eax,%fs
328 338
329 cld # gcc2 wants the direction flag cleared at all times 339 cld # gcc2 wants the direction flag cleared at all times
330 pushl $0 # fake return address for unwinder 340 pushl $0 # fake return address for unwinder
@@ -360,12 +370,12 @@ check_x87:
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be 370 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA. 371 * that CPU's GDT and PDA.
362 */ 372 */
363setup_pda: 373ENTRY(setup_pda)
364 /* get the PDA pointer */ 374 /* get the PDA pointer */
365 movl start_pda, %eax 375 movl start_pda, %eax
366 376
367 /* slot the PDA address into the GDT */ 377 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx 378 mov early_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ 379 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax 380 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ 381 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
@@ -492,6 +502,7 @@ ignore_int:
492#endif 502#endif
493 iret 503 iret
494 504
505.section .text
495#ifdef CONFIG_PARAVIRT 506#ifdef CONFIG_PARAVIRT
496startup_paravirt: 507startup_paravirt:
497 cld 508 cld
@@ -502,10 +513,11 @@ startup_paravirt:
502 pushl %ecx 513 pushl %ecx
503 pushl %eax 514 pushl %eax
504 515
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe 516 pushl $__start_paravirtprobe
5071: 5171:
508 movl 0(%esp), %eax 518 movl 0(%esp), %eax
519 cmpl $__stop_paravirtprobe, %eax
520 je unhandled_paravirt
509 pushl (%eax) 521 pushl (%eax)
510 movl 8(%esp), %eax 522 movl 8(%esp), %eax
511 call *(%esp) 523 call *(%esp)
@@ -517,6 +529,10 @@ startup_paravirt:
517 529
518 addl $4, (%esp) 530 addl $4, (%esp)
519 jmp 1b 531 jmp 1b
532
533unhandled_paravirt:
534 /* Nothing wanted us: we're screwed. */
535 ud2
520#endif 536#endif
521 537
522/* 538/*
@@ -581,7 +597,7 @@ idt_descr:
581 597
582# boot GDT descriptor (later on used by CPU#0): 598# boot GDT descriptor (later on used by CPU#0):
583 .word 0 # 32 bit align gdt_desc.address 599 .word 0 # 32 bit align gdt_desc.address
584ENTRY(cpu_gdt_descr) 600ENTRY(early_gdt_descr)
585 .word GDT_ENTRIES*8-1 601 .word GDT_ENTRIES*8-1
586 .long cpu_gdt_table 602 .long cpu_gdt_table
587 603
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index ba8d302a0b72..e30ccedad0b9 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1920,7 +1920,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
1920static void __init setup_ioapic_ids_from_mpc(void) { } 1920static void __init setup_ioapic_ids_from_mpc(void) { }
1921#endif 1921#endif
1922 1922
1923static int no_timer_check __initdata; 1923int no_timer_check __initdata;
1924 1924
1925static int __init notimercheck(char *s) 1925static int __init notimercheck(char *s)
1926{ 1926{
@@ -2310,7 +2310,7 @@ static inline void __init check_timer(void)
2310 2310
2311 disable_8259A_irq(0); 2311 disable_8259A_irq(0);
2312 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, 2312 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2313 "fasteio"); 2313 "fasteoi");
2314 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2314 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2315 enable_8259A_irq(0); 2315 enable_8259A_irq(0);
2316 2316
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 3201d421090a..5785d84103a6 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -19,6 +19,8 @@
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21 21
22#include <asm/idle.h>
23
22DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; 24DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
23EXPORT_PER_CPU_SYMBOL(irq_stat); 25EXPORT_PER_CPU_SYMBOL(irq_stat);
24 26
@@ -61,6 +63,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
61 union irq_ctx *curctx, *irqctx; 63 union irq_ctx *curctx, *irqctx;
62 u32 *isp; 64 u32 *isp;
63#endif 65#endif
66 exit_idle();
64 67
65 if (unlikely((unsigned)irq >= NR_IRQS)) { 68 if (unlikely((unsigned)irq >= NR_IRQS)) {
66 printk(KERN_EMERG "%s: cannot handle IRQ %d\n", 69 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index af1d53344993..b545bc746fce 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -363,7 +363,7 @@ no_kprobe:
363 " pushf\n" 363 " pushf\n"
364 /* skip cs, eip, orig_eax */ 364 /* skip cs, eip, orig_eax */
365 " subl $12, %esp\n" 365 " subl $12, %esp\n"
366 " pushl %gs\n" 366 " pushl %fs\n"
367 " pushl %ds\n" 367 " pushl %ds\n"
368 " pushl %es\n" 368 " pushl %es\n"
369 " pushl %eax\n" 369 " pushl %eax\n"
@@ -387,7 +387,7 @@ no_kprobe:
387 " popl %edi\n" 387 " popl %edi\n"
388 " popl %ebp\n" 388 " popl %ebp\n"
389 " popl %eax\n" 389 " popl %eax\n"
390 /* skip eip, orig_eax, es, ds, gs */ 390 /* skip eip, orig_eax, es, ds, fs */
391 " addl $20, %esp\n" 391 " addl $20, %esp\n"
392 " popf\n" 392 " popf\n"
393 " ret\n"); 393 " ret\n");
@@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
408 spin_lock_irqsave(&kretprobe_lock, flags); 408 spin_lock_irqsave(&kretprobe_lock, flags);
409 head = kretprobe_inst_table_head(current); 409 head = kretprobe_inst_table_head(current);
410 /* fixup registers */ 410 /* fixup registers */
411 regs->xcs = __KERNEL_CS; 411 regs->xcs = __KERNEL_CS | get_kernel_rpl();
412 regs->eip = trampoline_address; 412 regs->eip = trampoline_address;
413 regs->orig_eax = 0xffffffff; 413 regs->orig_eax = 0xffffffff;
414 414
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 381252bae3d8..b8f16633a6ec 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -384,7 +384,7 @@ static int do_microcode_update (void)
384{ 384{
385 long cursor = 0; 385 long cursor = 0;
386 int error = 0; 386 int error = 0;
387 void *new_mc; 387 void *new_mc = NULL;
388 int cpu; 388 int cpu;
389 cpumask_t old; 389 cpumask_t old;
390 390
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 4e14264f392a..bcaa6e9b6197 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
68#ifdef CONFIG_SMP 68#ifdef CONFIG_SMP
69 69
70struct msr_command { 70struct msr_command {
71 int cpu;
72 int err; 71 int err;
73 u32 reg; 72 u32 reg;
74 u32 data[2]; 73 u32 data[2];
@@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block)
78{ 77{
79 struct msr_command *cmd = (struct msr_command *)cmd_block; 78 struct msr_command *cmd = (struct msr_command *)cmd_block;
80 79
81 if (cmd->cpu == smp_processor_id()) 80 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
82 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
83} 81}
84 82
85static void msr_smp_rdmsr(void *cmd_block) 83static void msr_smp_rdmsr(void *cmd_block)
86{ 84{
87 struct msr_command *cmd = (struct msr_command *)cmd_block; 85 struct msr_command *cmd = (struct msr_command *)cmd_block;
88 86
89 if (cmd->cpu == smp_processor_id()) 87 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
90 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
91} 88}
92 89
93static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) 90static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
@@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
99 if (cpu == smp_processor_id()) { 96 if (cpu == smp_processor_id()) {
100 ret = wrmsr_eio(reg, eax, edx); 97 ret = wrmsr_eio(reg, eax, edx);
101 } else { 98 } else {
102 cmd.cpu = cpu;
103 cmd.reg = reg; 99 cmd.reg = reg;
104 cmd.data[0] = eax; 100 cmd.data[0] = eax;
105 cmd.data[1] = edx; 101 cmd.data[1] = edx;
106 102
107 smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); 103 smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1);
108 ret = cmd.err; 104 ret = cmd.err;
109 } 105 }
110 preempt_enable(); 106 preempt_enable();
@@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
120 if (cpu == smp_processor_id()) { 116 if (cpu == smp_processor_id()) {
121 ret = rdmsr_eio(reg, eax, edx); 117 ret = rdmsr_eio(reg, eax, edx);
122 } else { 118 } else {
123 cmd.cpu = cpu;
124 cmd.reg = reg; 119 cmd.reg = reg;
125 120
126 smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); 121 smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1);
127 122
128 *eax = cmd.data[0]; 123 *eax = cmd.data[0];
129 *edx = cmd.data[1]; 124 *edx = cmd.data[1];
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 1a6f8bb8881c..5d8a07c20281 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -185,7 +185,8 @@ static __cpuinit inline int nmi_known_cpu(void)
185{ 185{
186 switch (boot_cpu_data.x86_vendor) { 186 switch (boot_cpu_data.x86_vendor) {
187 case X86_VENDOR_AMD: 187 case X86_VENDOR_AMD:
188 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); 188 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
189 || (boot_cpu_data.x86 == 16));
189 case X86_VENDOR_INTEL: 190 case X86_VENDOR_INTEL:
190 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 191 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
191 return 1; 192 return 1;
@@ -216,6 +217,28 @@ static __init void nmi_cpu_busy(void *data)
216} 217}
217#endif 218#endif
218 219
220static unsigned int adjust_for_32bit_ctr(unsigned int hz)
221{
222 u64 counter_val;
223 unsigned int retval = hz;
224
225 /*
226 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
227 * are writable, with higher bits sign extending from bit 31.
228 * So we can only program the counter with 31-bit values; bit 31
229 * must be set so that bits 32 and above read back as 1.
230 * Find the appropriate nmi_hz.
231 */
232 counter_val = (u64)cpu_khz * 1000;
233 do_div(counter_val, retval);
234 if (counter_val > 0x7fffffffULL) {
235 u64 count = (u64)cpu_khz * 1000;
236 do_div(count, 0x7fffffffUL);
237 retval = count + 1;
238 }
239 return retval;
240}
241
219static int __init check_nmi_watchdog(void) 242static int __init check_nmi_watchdog(void)
220{ 243{
221 unsigned int *prev_nmi_count; 244 unsigned int *prev_nmi_count;
@@ -281,18 +304,10 @@ static int __init check_nmi_watchdog(void)
281 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 304 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
282 305
283 nmi_hz = 1; 306 nmi_hz = 1;
284 /* 307
285 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter 308 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
286 * are writable, with higher bits sign extending from bit 31. 309 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
287 * So, we can only program the counter with 31 bit values and 310 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
288 * 32nd bit should be 1, for 33.. to be 1.
289 * Find the appropriate nmi_hz
290 */
291 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
292 ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
293 u64 count = (u64)cpu_khz * 1000;
294 do_div(count, 0x7fffffffUL);
295 nmi_hz = count + 1;
296 } 311 }
297 } 312 }
298 313
@@ -369,6 +384,34 @@ void enable_timer_nmi_watchdog(void)
369 } 384 }
370} 385}
371 386
387static void __acpi_nmi_disable(void *__unused)
388{
389 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
390}
391
392/*
393 * Disable timer based NMIs on all CPUs:
394 */
395void acpi_nmi_disable(void)
396{
397 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
398 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
399}
400
401static void __acpi_nmi_enable(void *__unused)
402{
403 apic_write_around(APIC_LVT0, APIC_DM_NMI);
404}
405
406/*
407 * Enable timer based NMIs on all CPUs:
408 */
409void acpi_nmi_enable(void)
410{
411 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
412 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
413}
414
372#ifdef CONFIG_PM 415#ifdef CONFIG_PM
373 416
374static int nmi_pm_active; /* nmi_active before suspend */ 417static int nmi_pm_active; /* nmi_active before suspend */
@@ -442,6 +485,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
442 wrmsrl(perfctr_msr, 0 - count); 485 wrmsrl(perfctr_msr, 0 - count);
443} 486}
444 487
488static void write_watchdog_counter32(unsigned int perfctr_msr,
489 const char *descr)
490{
491 u64 count = (u64)cpu_khz * 1000;
492
493 do_div(count, nmi_hz);
494 if (descr)
495 Dprintk("setting %s to -0x%08Lx\n", descr, count);
496 wrmsr(perfctr_msr, (u32)(-count), 0);
497}
498
445/* Note that these events don't tick when the CPU idles. This means 499/* Note that these events don't tick when the CPU idles. This means
446 the frequency varies with CPU load. */ 500 the frequency varies with CPU load. */
447 501
@@ -531,7 +585,8 @@ static int setup_p6_watchdog(void)
531 585
532 /* setup the timer */ 586 /* setup the timer */
533 wrmsr(evntsel_msr, evntsel, 0); 587 wrmsr(evntsel_msr, evntsel, 0);
534 write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); 588 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
589 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
535 apic_write(APIC_LVTPC, APIC_DM_NMI); 590 apic_write(APIC_LVTPC, APIC_DM_NMI);
536 evntsel |= P6_EVNTSEL0_ENABLE; 591 evntsel |= P6_EVNTSEL0_ENABLE;
537 wrmsr(evntsel_msr, evntsel, 0); 592 wrmsr(evntsel_msr, evntsel, 0);
@@ -704,7 +759,8 @@ static int setup_intel_arch_watchdog(void)
704 759
705 /* setup the timer */ 760 /* setup the timer */
706 wrmsr(evntsel_msr, evntsel, 0); 761 wrmsr(evntsel_msr, evntsel, 0);
707 write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); 762 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
763 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
708 apic_write(APIC_LVTPC, APIC_DM_NMI); 764 apic_write(APIC_LVTPC, APIC_DM_NMI);
709 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 765 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
710 wrmsr(evntsel_msr, evntsel, 0); 766 wrmsr(evntsel_msr, evntsel, 0);
@@ -762,7 +818,8 @@ void setup_apic_nmi_watchdog (void *unused)
762 if (nmi_watchdog == NMI_LOCAL_APIC) { 818 if (nmi_watchdog == NMI_LOCAL_APIC) {
763 switch (boot_cpu_data.x86_vendor) { 819 switch (boot_cpu_data.x86_vendor) {
764 case X86_VENDOR_AMD: 820 case X86_VENDOR_AMD:
765 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) 821 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
822 boot_cpu_data.x86 != 16)
766 return; 823 return;
767 if (!setup_k7_watchdog()) 824 if (!setup_k7_watchdog())
768 return; 825 return;
@@ -956,6 +1013,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
956 dummy &= ~P4_CCCR_OVF; 1013 dummy &= ~P4_CCCR_OVF;
957 wrmsrl(wd->cccr_msr, dummy); 1014 wrmsrl(wd->cccr_msr, dummy);
958 apic_write(APIC_LVTPC, APIC_DM_NMI); 1015 apic_write(APIC_LVTPC, APIC_DM_NMI);
1016 /* start the cycle over again */
1017 write_watchdog_counter(wd->perfctr_msr, NULL);
959 } 1018 }
960 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || 1019 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
961 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { 1020 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
@@ -964,9 +1023,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
964 * other P6 variant. 1023 * other P6 variant.
965 * ArchPerfmon/Core Duo also needs this */ 1024 * ArchPerfmon/Core Duo also needs this */
966 apic_write(APIC_LVTPC, APIC_DM_NMI); 1025 apic_write(APIC_LVTPC, APIC_DM_NMI);
1026 /* P6/ARCH_PERFMON has 32 bit counter write */
1027 write_watchdog_counter32(wd->perfctr_msr, NULL);
1028 } else {
1029 /* start the cycle over again */
1030 write_watchdog_counter(wd->perfctr_msr, NULL);
967 } 1031 }
968 /* start the cycle over again */
969 write_watchdog_counter(wd->perfctr_msr, NULL);
970 rc = 1; 1032 rc = 1;
971 } else if (nmi_watchdog == NMI_IO_APIC) { 1033 } else if (nmi_watchdog == NMI_IO_APIC) {
972 /* don't know how to accurately check for this. 1034 /* don't know how to accurately check for this.
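
Some worked numbers for adjust_for_32bit_ctr() above: on an assumed 3 GHz
CPU with nmi_hz = 1, the reload value cpu_khz * 1000 = 3e9 exceeds
0x7fffffff (about 2.15e9), the largest count a sign-extending 32-bit
perfctr can hold, so the watchdog rate is raised to
3e9 / 0x7fffffff + 1 = 2 Hz. A stand-alone version of the same arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    static unsigned adjust_for_32bit_ctr(uint64_t cpu_khz, unsigned hz)
    {
            uint64_t counter_val = cpu_khz * 1000 / hz;

            if (counter_val > 0x7fffffffULL)    /* won't fit in 31 bits */
                    hz = cpu_khz * 1000 / 0x7fffffffULL + 1;
            return hz;
    }

    int main(void)
    {
            printf("%u\n", adjust_for_32bit_ctr(3000000, 1)); /* prints 2 */
            return 0;
    }
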
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index e55fd05da0f5..c156ecfa3872 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
92 return insn_len; 92 return insn_len;
93} 93}
94 94
95static fastcall unsigned long native_get_debugreg(int regno) 95static unsigned long native_get_debugreg(int regno)
96{ 96{
97 unsigned long val = 0; /* Damn you, gcc! */ 97 unsigned long val = 0; /* Damn you, gcc! */
98 98
@@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno)
115 return val; 115 return val;
116} 116}
117 117
118static fastcall void native_set_debugreg(int regno, unsigned long value) 118static void native_set_debugreg(int regno, unsigned long value)
119{ 119{
120 switch (regno) { 120 switch (regno) {
121 case 0: 121 case 0:
@@ -146,55 +146,55 @@ void init_IRQ(void)
146 paravirt_ops.init_IRQ(); 146 paravirt_ops.init_IRQ();
147} 147}
148 148
149static fastcall void native_clts(void) 149static void native_clts(void)
150{ 150{
151 asm volatile ("clts"); 151 asm volatile ("clts");
152} 152}
153 153
154static fastcall unsigned long native_read_cr0(void) 154static unsigned long native_read_cr0(void)
155{ 155{
156 unsigned long val; 156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); 157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val; 158 return val;
159} 159}
160 160
161static fastcall void native_write_cr0(unsigned long val) 161static void native_write_cr0(unsigned long val)
162{ 162{
163 asm volatile("movl %0,%%cr0": :"r" (val)); 163 asm volatile("movl %0,%%cr0": :"r" (val));
164} 164}
165 165
166static fastcall unsigned long native_read_cr2(void) 166static unsigned long native_read_cr2(void)
167{ 167{
168 unsigned long val; 168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); 169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val; 170 return val;
171} 171}
172 172
173static fastcall void native_write_cr2(unsigned long val) 173static void native_write_cr2(unsigned long val)
174{ 174{
175 asm volatile("movl %0,%%cr2": :"r" (val)); 175 asm volatile("movl %0,%%cr2": :"r" (val));
176} 176}
177 177
178static fastcall unsigned long native_read_cr3(void) 178static unsigned long native_read_cr3(void)
179{ 179{
180 unsigned long val; 180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); 181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val; 182 return val;
183} 183}
184 184
185static fastcall void native_write_cr3(unsigned long val) 185static void native_write_cr3(unsigned long val)
186{ 186{
187 asm volatile("movl %0,%%cr3": :"r" (val)); 187 asm volatile("movl %0,%%cr3": :"r" (val));
188} 188}
189 189
190static fastcall unsigned long native_read_cr4(void) 190static unsigned long native_read_cr4(void)
191{ 191{
192 unsigned long val; 192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); 193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val; 194 return val;
195} 195}
196 196
197static fastcall unsigned long native_read_cr4_safe(void) 197static unsigned long native_read_cr4_safe(void)
198{ 198{
199 unsigned long val; 199 unsigned long val;
200 /* This could fault if %cr4 does not exist */ 200 /* This could fault if %cr4 does not exist */
@@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void)
207 return val; 207 return val;
208} 208}
209 209
210static fastcall void native_write_cr4(unsigned long val) 210static void native_write_cr4(unsigned long val)
211{ 211{
212 asm volatile("movl %0,%%cr4": :"r" (val)); 212 asm volatile("movl %0,%%cr4": :"r" (val));
213} 213}
214 214
215static fastcall unsigned long native_save_fl(void) 215static unsigned long native_save_fl(void)
216{ 216{
217 unsigned long f; 217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); 218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f; 219 return f;
220} 220}
221 221
222static fastcall void native_restore_fl(unsigned long f) 222static void native_restore_fl(unsigned long f)
223{ 223{
224 asm volatile("pushl %0 ; popfl": /* no output */ 224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f) 225 :"g" (f)
226 :"memory", "cc"); 226 :"memory", "cc");
227} 227}
228 228
229static fastcall void native_irq_disable(void) 229static void native_irq_disable(void)
230{ 230{
231 asm volatile("cli": : :"memory"); 231 asm volatile("cli": : :"memory");
232} 232}
233 233
234static fastcall void native_irq_enable(void) 234static void native_irq_enable(void)
235{ 235{
236 asm volatile("sti": : :"memory"); 236 asm volatile("sti": : :"memory");
237} 237}
238 238
239static fastcall void native_safe_halt(void) 239static void native_safe_halt(void)
240{ 240{
241 asm volatile("sti; hlt": : :"memory"); 241 asm volatile("sti; hlt": : :"memory");
242} 242}
243 243
244static fastcall void native_halt(void) 244static void native_halt(void)
245{ 245{
246 asm volatile("hlt": : :"memory"); 246 asm volatile("hlt": : :"memory");
247} 247}
248 248
249static fastcall void native_wbinvd(void) 249static void native_wbinvd(void)
250{ 250{
251 asm volatile("wbinvd": : :"memory"); 251 asm volatile("wbinvd": : :"memory");
252} 252}
253 253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) 254static unsigned long long native_read_msr(unsigned int msr, int *err)
255{ 255{
256 unsigned long long val; 256 unsigned long long val;
257 257
@@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
270 return val; 270 return val;
271} 271}
272 272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val) 273static int native_write_msr(unsigned int msr, unsigned long long val)
274{ 274{
275 int err; 275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n" 276 asm volatile("2: wrmsr ; xorl %0,%0\n"
@@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
288 return err; 288 return err;
289} 289}
290 290
291static fastcall unsigned long long native_read_tsc(void) 291static unsigned long long native_read_tsc(void)
292{ 292{
293 unsigned long long val; 293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val)); 294 asm volatile("rdtsc" : "=A" (val));
295 return val; 295 return val;
296} 296}
297 297
298static fastcall unsigned long long native_read_pmc(void) 298static unsigned long long native_read_pmc(void)
299{ 299{
300 unsigned long long val; 300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val)); 301 asm volatile("rdpmc" : "=A" (val));
302 return val; 302 return val;
303} 303}
304 304
305static fastcall void native_load_tr_desc(void) 305static void native_load_tr_desc(void)
306{ 306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); 307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308} 308}
309 309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) 310static void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{ 311{
312 asm volatile("lgdt %0"::"m" (*dtr)); 312 asm volatile("lgdt %0"::"m" (*dtr));
313} 313}
314 314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) 315static void native_load_idt(const struct Xgt_desc_struct *dtr)
316{ 316{
317 asm volatile("lidt %0"::"m" (*dtr)); 317 asm volatile("lidt %0"::"m" (*dtr));
318} 318}
319 319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) 320static void native_store_gdt(struct Xgt_desc_struct *dtr)
321{ 321{
322 asm ("sgdt %0":"=m" (*dtr)); 322 asm ("sgdt %0":"=m" (*dtr));
323} 323}
324 324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) 325static void native_store_idt(struct Xgt_desc_struct *dtr)
326{ 326{
327 asm ("sidt %0":"=m" (*dtr)); 327 asm ("sidt %0":"=m" (*dtr));
328} 328}
329 329
330static fastcall unsigned long native_store_tr(void) 330static unsigned long native_store_tr(void)
331{ 331{
332 unsigned long tr; 332 unsigned long tr;
333 asm ("str %0":"=r" (tr)); 333 asm ("str %0":"=r" (tr));
334 return tr; 334 return tr;
335} 335}
336 336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) 337static void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{ 338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] 339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2); 340 C(0); C(1); C(2);
@@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32
348 lp[1] = entry_high; 348 lp[1] = entry_high;
349} 349}
350 350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) 351static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{ 352{
353 native_write_dt_entry(dt, entrynum, low, high); 353 native_write_dt_entry(dt, entrynum, low, high);
354} 354}
355 355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) 356static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{ 357{
358 native_write_dt_entry(dt, entrynum, low, high); 358 native_write_dt_entry(dt, entrynum, low, high);
359} 359}
360 360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) 361static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{ 362{
363 native_write_dt_entry(dt, entrynum, low, high); 363 native_write_dt_entry(dt, entrynum, low, high);
364} 364}
365 365
366static fastcall void native_load_esp0(struct tss_struct *tss, 366static void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread) 367 struct thread_struct *thread)
368{ 368{
369 tss->esp0 = thread->esp0; 369 tss->esp0 = thread->esp0;
@@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss,
375 } 375 }
376} 376}
377 377
378static fastcall void native_io_delay(void) 378static void native_io_delay(void)
379{ 379{
380 asm volatile("outb %al,$0x80"); 380 asm volatile("outb %al,$0x80");
381} 381}
382 382
383static fastcall void native_flush_tlb(void) 383static void native_flush_tlb(void)
384{ 384{
385 __native_flush_tlb(); 385 __native_flush_tlb();
386} 386}
@@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void)
389 * Global pages have to be flushed a bit differently. Not a real 389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often. 390 * performance problem because this does not happen often.
391 */ 391 */
392static fastcall void native_flush_tlb_global(void) 392static void native_flush_tlb_global(void)
393{ 393{
394 __native_flush_tlb_global(); 394 __native_flush_tlb_global();
395} 395}
396 396
397static fastcall void native_flush_tlb_single(u32 addr) 397static void native_flush_tlb_single(u32 addr)
398{ 398{
399 __native_flush_tlb_single(addr); 399 __native_flush_tlb_single(addr);
400} 400}
401 401
402#ifndef CONFIG_X86_PAE 402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) 403static void native_set_pte(pte_t *ptep, pte_t pteval)
404{ 404{
405 *ptep = pteval; 405 *ptep = pteval;
406} 406}
407 407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) 408static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{ 409{
410 *ptep = pteval; 410 *ptep = pteval;
411} 411}
412 412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) 413static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{ 414{
415 *pmdp = pmdval; 415 *pmdp = pmdval;
416} 416}
417 417
418#else /* CONFIG_X86_PAE */ 418#else /* CONFIG_X86_PAE */
419 419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte) 420static void native_set_pte(pte_t *ptep, pte_t pte)
421{ 421{
422 ptep->pte_high = pte.pte_high; 422 ptep->pte_high = pte.pte_high;
423 smp_wmb(); 423 smp_wmb();
424 ptep->pte_low = pte.pte_low; 424 ptep->pte_low = pte.pte_low;
425} 425}
426 426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) 427static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{ 428{
429 ptep->pte_high = pte.pte_high; 429 ptep->pte_high = pte.pte_high;
430 smp_wmb(); 430 smp_wmb();
431 ptep->pte_low = pte.pte_low; 431 ptep->pte_low = pte.pte_low;
432} 432}
433 433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 434static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{ 435{
436 ptep->pte_low = 0; 436 ptep->pte_low = 0;
437 smp_wmb(); 437 smp_wmb();
@@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long
440 ptep->pte_low = pte.pte_low; 440 ptep->pte_low = pte.pte_low;
441} 441}
442 442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) 443static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{ 444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval)); 445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446} 446}
447 447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) 448static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{ 449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); 450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451} 451}
452 452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) 453static void native_set_pud(pud_t *pudp, pud_t pudval)
454{ 454{
455 *pudp = pudval; 455 *pudp = pudval;
456} 456}
457 457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 458static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{ 459{
460 ptep->pte_low = 0; 460 ptep->pte_low = 0;
461 smp_wmb(); 461 smp_wmb();
462 ptep->pte_high = 0; 462 ptep->pte_high = 0;
463} 463}
464 464
465static fastcall void native_pmd_clear(pmd_t *pmd) 465static void native_pmd_clear(pmd_t *pmd)
466{ 466{
467 u32 *tmp = (u32 *)pmd; 467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0; 468 *tmp = 0;
@@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd)
472#endif /* CONFIG_X86_PAE */ 472#endif /* CONFIG_X86_PAE */
473 473
474/* These are in entry.S */ 474/* These are in entry.S */
475extern fastcall void native_iret(void); 475extern void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void); 476extern void native_irq_enable_sysexit(void);
477 477
478static int __init print_banner(void) 478static int __init print_banner(void)
479{ 479{
@@ -482,9 +482,6 @@ static int __init print_banner(void)
482} 482}
483core_initcall(print_banner); 483core_initcall(print_banner);
484 484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = { 485struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware", 486 .name = "bare hardware",
490 .paravirt_enabled = 0, 487 .paravirt_enabled = 0,
@@ -544,12 +541,21 @@ struct paravirt_ops paravirt_ops = {
544 .apic_write = native_apic_write, 541 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic, 542 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read, 543 .apic_read = native_apic_read,
544 .setup_boot_clock = setup_boot_APIC_clock,
545 .setup_secondary_clock = setup_secondary_APIC_clock,
547#endif 546#endif
547 .set_lazy_mode = (void *)native_nop,
548 548
549 .flush_tlb_user = native_flush_tlb, 549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global, 550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single, 551 .flush_tlb_single = native_flush_tlb_single,
552 552
553 .alloc_pt = (void *)native_nop,
554 .alloc_pd = (void *)native_nop,
555 .alloc_pd_clone = (void *)native_nop,
556 .release_pt = (void *)native_nop,
557 .release_pd = (void *)native_nop,
558
553 .set_pte = native_set_pte, 559 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at, 560 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd, 561 .set_pmd = native_set_pmd,
@@ -565,6 +571,8 @@ struct paravirt_ops paravirt_ops = {
565 571
566 .irq_enable_sysexit = native_irq_enable_sysexit, 572 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret, 573 .iret = native_iret,
574
575 .startup_ipi_hook = (void *)native_nop,
568}; 576};
569 577
570/* 578/*
diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/i386/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
1#include <linux/platform_device.h>
2#include <linux/errno.h>
3#include <linux/init.h>
4
5static __init int add_pcspkr(void)
6{
7 struct platform_device *pd;
8 int ret;
9
10 pd = platform_device_alloc("pcspkr", -1);
11 if (!pd)
12 return -ENOMEM;
13
14 ret = platform_device_add(pd);
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19}
20device_initcall(add_pcspkr);
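The alloc/add/put sequence above is the long form of registering a resource-less platform device; the same registration can be written with the platform_device_register_simple() helper. An illustrative sketch:

	#include <linux/platform_device.h>
	#include <linux/err.h>
	#include <linux/init.h>

	static __init int add_pcspkr_alt(void)
	{
		struct platform_device *pd;

		pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
		return IS_ERR(pd) ? PTR_ERR(pd) : 0;
	}

The long form keeps the error path explicit, which pays off once a device grows resources or platform data.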
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index c641056233a6..7845d480c293 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -48,6 +48,7 @@
48#include <asm/i387.h> 48#include <asm/i387.h>
49#include <asm/desc.h> 49#include <asm/desc.h>
50#include <asm/vm86.h> 50#include <asm/vm86.h>
51#include <asm/idle.h>
51#ifdef CONFIG_MATH_EMULATION 52#ifdef CONFIG_MATH_EMULATION
52#include <asm/math_emu.h> 53#include <asm/math_emu.h>
53#endif 54#endif
@@ -80,6 +81,42 @@ void (*pm_idle)(void);
80EXPORT_SYMBOL(pm_idle); 81EXPORT_SYMBOL(pm_idle);
81static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 82static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
82 83
84static ATOMIC_NOTIFIER_HEAD(idle_notifier);
85
86void idle_notifier_register(struct notifier_block *n)
87{
88 atomic_notifier_chain_register(&idle_notifier, n);
89}
90
91void idle_notifier_unregister(struct notifier_block *n)
92{
93 atomic_notifier_chain_unregister(&idle_notifier, n);
94}
95
96static DEFINE_PER_CPU(volatile unsigned long, idle_state);
97
98void enter_idle(void)
99{
100 /* needs to be atomic w.r.t. interrupts, not against other CPUs */
101 __set_bit(0, &__get_cpu_var(idle_state));
102 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
103}
104
105static void __exit_idle(void)
106{
107 /* needs to be atomic w.r.t. interrupts, not against other CPUs */
108 if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0)
109 return;
110 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
111}
112
113void exit_idle(void)
114{
115 if (current->pid)
116 return;
117 __exit_idle();
118}
119
83void disable_hlt(void) 120void disable_hlt(void)
84{ 121{
85 hlt_counter++; 122 hlt_counter++;
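The new chain lets other code react when a CPU enters or leaves the idle loop. A hypothetical consumer, assuming the IDLE_START/IDLE_END constants from the new <asm/idle.h>:

	#include <linux/notifier.h>
	#include <asm/idle.h>

	static int my_idle_event(struct notifier_block *nb,
				 unsigned long action, void *data)
	{
		switch (action) {
		case IDLE_START:
			/* CPU is about to idle: quiesce per-CPU work */
			break;
		case IDLE_END:
			/* CPU has resumed useful work */
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block my_idle_nb = {
		.notifier_call = my_idle_event,
	};

	/* in a driver's init path: idle_notifier_register(&my_idle_nb); */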
@@ -130,6 +167,7 @@ EXPORT_SYMBOL(default_idle);
130 */ 167 */
131static void poll_idle (void) 168static void poll_idle (void)
132{ 169{
170 local_irq_enable();
133 cpu_relax(); 171 cpu_relax();
134} 172}
135 173
@@ -189,7 +227,16 @@ void cpu_idle(void)
189 play_dead(); 227 play_dead();
190 228
191 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 229 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
230
231 /*
232 * Idle routines should keep interrupts disabled
233 * from here on, until they go to idle.
234 * Otherwise, idle callbacks can misfire.
235 */
236 local_irq_disable();
237 enter_idle();
192 idle(); 238 idle();
239 __exit_idle();
193 } 240 }
194 preempt_enable_no_resched(); 241 preempt_enable_no_resched();
195 schedule(); 242 schedule();
@@ -243,7 +290,11 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
243 __monitor((void *)&current_thread_info()->flags, 0, 0); 290 __monitor((void *)&current_thread_info()->flags, 0, 0);
244 smp_mb(); 291 smp_mb();
245 if (!need_resched()) 292 if (!need_resched())
246 __mwait(eax, ecx); 293 __sti_mwait(eax, ecx);
294 else
295 local_irq_enable();
296 } else {
297 local_irq_enable();
247 } 298 }
248} 299}
249 300
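cpu_idle() now calls the idle routine with interrupts disabled, so every implementation must re-enable them on all paths, as the poll_idle() and mwait changes above do. A compliant routine looks roughly like this sketch:

	static void my_idle(void)
	{
		if (need_resched()) {
			local_irq_enable();
			return;
		}
		safe_halt();	/* "sti; hlt": enables interrupts and halts atomically */
	}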
@@ -308,8 +359,8 @@ void show_regs(struct pt_regs * regs)
308 regs->eax,regs->ebx,regs->ecx,regs->edx); 359 regs->eax,regs->ebx,regs->ecx,regs->edx);
309 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 360 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
310 regs->esi, regs->edi, regs->ebp); 361 regs->esi, regs->edi, regs->ebp);
311 printk(" DS: %04x ES: %04x GS: %04x\n", 362 printk(" DS: %04x ES: %04x FS: %04x\n",
312 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); 363 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
313 364
314 cr0 = read_cr0(); 365 cr0 = read_cr0();
315 cr2 = read_cr2(); 366 cr2 = read_cr2();
@@ -340,7 +391,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
340 391
341 regs.xds = __USER_DS; 392 regs.xds = __USER_DS;
342 regs.xes = __USER_DS; 393 regs.xes = __USER_DS;
343 regs.xgs = __KERNEL_PDA; 394 regs.xfs = __KERNEL_PDA;
344 regs.orig_eax = -1; 395 regs.orig_eax = -1;
345 regs.eip = (unsigned long) kernel_thread_helper; 396 regs.eip = (unsigned long) kernel_thread_helper;
346 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 397 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -425,7 +476,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
425 476
426 p->thread.eip = (unsigned long) ret_from_fork; 477 p->thread.eip = (unsigned long) ret_from_fork;
427 478
428 savesegment(fs,p->thread.fs); 479 savesegment(gs,p->thread.gs);
429 480
430 tsk = current; 481 tsk = current;
431 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 482 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -501,8 +552,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
501 dump->regs.eax = regs->eax; 552 dump->regs.eax = regs->eax;
502 dump->regs.ds = regs->xds; 553 dump->regs.ds = regs->xds;
503 dump->regs.es = regs->xes; 554 dump->regs.es = regs->xes;
504 savesegment(fs,dump->regs.fs); 555 dump->regs.fs = regs->xfs;
505 dump->regs.gs = regs->xgs; 556 savesegment(gs,dump->regs.gs);
506 dump->regs.orig_eax = regs->orig_eax; 557 dump->regs.orig_eax = regs->orig_eax;
507 dump->regs.eip = regs->eip; 558 dump->regs.eip = regs->eip;
508 dump->regs.cs = regs->xcs; 559 dump->regs.cs = regs->xcs;
@@ -653,7 +704,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
653 load_esp0(tss, next); 704 load_esp0(tss, next);
654 705
655 /* 706 /*
656 * Save away %fs. No need to save %gs, as it was saved on the 707 * Save away %gs. No need to save %fs, as it was saved on the
657 * stack on entry. No need to save %es and %ds, as those are 708 * stack on entry. No need to save %es and %ds, as those are
658 * always kernel segments while inside the kernel. Doing this 709 * always kernel segments while inside the kernel. Doing this
659 * before setting the new TLS descriptors avoids the situation 710 * before setting the new TLS descriptors avoids the situation
@@ -662,7 +713,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
662 * used %fs or %gs (it does not today), or if the kernel is 713 * used %fs or %gs (it does not today), or if the kernel is
663 * running inside of a hypervisor layer. 714 * running inside of a hypervisor layer.
664 */ 715 */
665 savesegment(fs, prev->fs); 716 savesegment(gs, prev->gs);
666 717
667 /* 718 /*
668 * Load the per-thread Thread-Local Storage descriptor. 719 * Load the per-thread Thread-Local Storage descriptor.
@@ -670,14 +721,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
670 load_TLS(next, cpu); 721 load_TLS(next, cpu);
671 722
672 /* 723 /*
673 * Restore %fs if needed. 724 * Restore IOPL if needed. In normal use, the flags restore
674 * 725 * in the switch assembly will handle this. But if the kernel
675 * Glibc normally makes %fs be zero. 726 * is running virtualized at a non-zero CPL, the popf will
727 * not restore flags, so it must be done in a separate step.
676 */ 728 */
677 if (unlikely(prev->fs | next->fs)) 729 if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
678 loadsegment(fs, next->fs); 730 set_iopl_mask(next->iopl);
679
680 write_pda(pcurrent, next_p);
681 731
682 /* 732 /*
683 * Now maybe handle debug registers and/or IO bitmaps 733 * Now maybe handle debug registers and/or IO bitmaps
@@ -688,6 +738,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
688 738
689 disable_tsc(prev_p, next_p); 739 disable_tsc(prev_p, next_p);
690 740
741 /*
742 * Leave lazy mode, flushing any hypercalls made here.
743 * This must be done before restoring TLS segments so
744 * the GDT and LDT are properly updated, and must be
745 * done before math_state_restore, so the TS bit is up
746 * to date.
747 */
748 arch_leave_lazy_cpu_mode();
749
691 /* If the task has used fpu the last 5 timeslices, just do a full 750 /* If the task has used fpu the last 5 timeslices, just do a full
692 * restore of the math state immediately to avoid the trap; the 751 * restore of the math state immediately to avoid the trap; the
693 * chances of needing FPU soon are obviously high now 752 * chances of needing FPU soon are obviously high now
@@ -695,6 +754,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
695 if (next_p->fpu_counter > 5) 754 if (next_p->fpu_counter > 5)
696 math_state_restore(); 755 math_state_restore();
697 756
757 /*
758 * Restore %gs if needed (which is common)
759 */
760 if (prev->gs | next->gs)
761 loadsegment(gs, next->gs);
762
763 write_pda(pcurrent, next_p);
764
698 return prev_p; 765 return prev_p;
699} 766}
700 767
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index af8aabe85800..4a8f8a259723 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -89,14 +89,14 @@ static int putreg(struct task_struct *child,
89 unsigned long regno, unsigned long value) 89 unsigned long regno, unsigned long value)
90{ 90{
91 switch (regno >> 2) { 91 switch (regno >> 2) {
92 case FS: 92 case GS:
93 if (value && (value & 3) != 3) 93 if (value && (value & 3) != 3)
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.gs = value;
96 return 0; 96 return 0;
97 case DS: 97 case DS:
98 case ES: 98 case ES:
99 case GS: 99 case FS:
100 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
101 return -EIO; 101 return -EIO;
102 value &= 0xffff; 102 value &= 0xffff;
@@ -112,7 +112,7 @@ static int putreg(struct task_struct *child,
112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
113 break; 113 break;
114 } 114 }
115 if (regno > ES*4) 115 if (regno > FS*4)
116 regno -= 1*4; 116 regno -= 1*4;
117 put_stack_long(child, regno, value); 117 put_stack_long(child, regno, value);
118 return 0; 118 return 0;
@@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child,
124 unsigned long retval = ~0UL; 124 unsigned long retval = ~0UL;
125 125
126 switch (regno >> 2) { 126 switch (regno >> 2) {
127 case FS: 127 case GS:
128 retval = child->thread.fs; 128 retval = child->thread.gs;
129 break; 129 break;
130 case DS: 130 case DS:
131 case ES: 131 case ES:
132 case GS: 132 case FS:
133 case SS: 133 case SS:
134 case CS: 134 case CS:
135 retval = 0xffff; 135 retval = 0xffff;
136 /* fall through */ 136 /* fall through */
137 default: 137 default:
138 if (regno > ES*4) 138 if (regno > FS*4)
139 regno -= 1*4; 139 regno -= 1*4;
140 retval &= get_stack_long(child, regno); 140 retval &= get_stack_long(child, regno);
141 } 141 }
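With %gs now the lazily-switched segment, getreg() serves it from thread.gs while %fs comes off the kernel stack; the PTRACE_PEEKUSER ABI itself is unchanged. A userspace sketch of reading the traced child's %gs:

	#include <stddef.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>

	long read_child_gs(pid_t pid)
	{
		return ptrace(PTRACE_PEEKUSER, pid,
			      (void *)offsetof(struct user_regs_struct, xgs), NULL);
	}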
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 4694ac980cd2..122623dcc6e1 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -33,7 +33,6 @@
33#include <linux/initrd.h> 33#include <linux/initrd.h>
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/platform_device.h>
37#include <linux/console.h> 36#include <linux/console.h>
38#include <linux/mca.h> 37#include <linux/mca.h>
39#include <linux/root_dev.h> 38#include <linux/root_dev.h>
@@ -60,6 +59,7 @@
60#include <asm/io_apic.h> 59#include <asm/io_apic.h>
61#include <asm/ist.h> 60#include <asm/ist.h>
62#include <asm/io.h> 61#include <asm/io.h>
62#include <asm/vmi.h>
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
@@ -581,6 +581,14 @@ void __init setup_arch(char **cmdline_p)
581 581
582 max_low_pfn = setup_memory(); 582 max_low_pfn = setup_memory();
583 583
584#ifdef CONFIG_VMI
585 /*
586 * Must be after max_low_pfn is determined, and before kernel
587 * pagetables are setup.
588 */
589 vmi_init();
590#endif
591
584 /* 592 /*
585 * NOTE: before this point _nobody_ is allowed to allocate 593 * NOTE: before this point _nobody_ is allowed to allocate
586 * any memory using the bootmem allocator. Although the 594 * any memory using the bootmem allocator. Although the
@@ -651,28 +659,3 @@ void __init setup_arch(char **cmdline_p)
651#endif 659#endif
652 tsc_init(); 660 tsc_init();
653} 661}
654
655static __init int add_pcspkr(void)
656{
657 struct platform_device *pd;
658 int ret;
659
660 pd = platform_device_alloc("pcspkr", -1);
661 if (!pd)
662 return -ENOMEM;
663
664 ret = platform_device_add(pd);
665 if (ret)
666 platform_device_put(pd);
667
668 return ret;
669}
670device_initcall(add_pcspkr);
671
672/*
673 * Local Variables:
674 * mode:c
675 * c-file-style:"k&r"
676 * c-basic-offset:8
677 * End:
678 */
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 65d7620eaa09..4f99e870c986 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/elf.h> 23#include <linux/elf.h>
24#include <linux/binfmts.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/ucontext.h> 26#include <asm/ucontext.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -128,8 +129,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 129 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 130 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 131
131 COPY_SEG(gs); 132 GET_SEG(gs);
132 GET_SEG(fs); 133 COPY_SEG(fs);
133 COPY_SEG(es); 134 COPY_SEG(es);
134 COPY_SEG(ds); 135 COPY_SEG(ds);
135 COPY(edi); 136 COPY(edi);
@@ -244,9 +245,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 245{
245 int tmp, err = 0; 246 int tmp, err = 0;
246 247
247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); 248 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
248 savesegment(fs, tmp); 249 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 250 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 251
251 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); 252 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
252 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); 253 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
@@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
349 goto give_sigsegv; 350 goto give_sigsegv;
350 } 351 }
351 352
352 restorer = (void *)VDSO_SYM(&__kernel_sigreturn); 353 if (current->binfmt->hasvdso)
354 restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
355 else
356 restorer = (void *)&frame->retcode;
353 if (ka->sa.sa_flags & SA_RESTORER) 357 if (ka->sa.sa_flags & SA_RESTORER)
354 restorer = ka->sa.sa_restorer; 358 restorer = ka->sa.sa_restorer;
355 359
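When the binary format provides no vDSO, the restorer falls back to frame->retcode, the classic on-stack trampoline that setup_frame() writes out; in effect it encodes:

	popl	%eax			/* discard the signal number */
	movl	$__NR_sigreturn, %eax
	int	$0x80			/* re-enter the kernel via sigreturn() */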
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 5285aff8367f..9bd9637ae692 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -23,6 +23,7 @@
23 23
24#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26#include <asm/idle.h>
26#include <mach_apic.h> 27#include <mach_apic.h>
27 28
28/* 29/*
@@ -374,8 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
374 /* 375 /*
375 * i'm not happy about this global shared spinlock in the 376 * i'm not happy about this global shared spinlock in the
376 * MM hot path, but we'll see how contended it is. 377 * MM hot path, but we'll see how contended it is.
377 * Temporarily this turns IRQs off, so that lockups are 378 * AK: x86-64 has a faster method that could be ported.
378 * detected by the NMI watchdog.
379 */ 379 */
380 spin_lock(&tlbstate_lock); 380 spin_lock(&tlbstate_lock);
381 381
@@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
400 400
401 while (!cpus_empty(flush_cpumask)) 401 while (!cpus_empty(flush_cpumask))
402 /* nothing. lockup detection does not belong here */ 402 /* nothing. lockup detection does not belong here */
403 mb(); 403 cpu_relax();
404 404
405 flush_mm = NULL; 405 flush_mm = NULL;
406 flush_va = 0; 406 flush_va = 0;
@@ -624,6 +624,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
624 /* 624 /*
625 * At this point the info structure may be out of scope unless wait==1 625 * At this point the info structure may be out of scope unless wait==1
626 */ 626 */
627 exit_idle();
627 irq_enter(); 628 irq_enter();
628 (*func)(info); 629 (*func)(info);
629 irq_exit(); 630 irq_exit();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 8c6c8c52b95c..f46a4d095e6c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -63,6 +63,7 @@
63#include <mach_apic.h> 63#include <mach_apic.h>
64#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
65#include <smpboot_hooks.h> 65#include <smpboot_hooks.h>
66#include <asm/vmi.h>
66 67
67/* Set if we find a B stepping CPU */ 68/* Set if we find a B stepping CPU */
68static int __devinitdata smp_b_stepping; 69static int __devinitdata smp_b_stepping;
@@ -545,12 +546,15 @@ static void __cpuinit start_secondary(void *unused)
545 * booting is too fragile that we want to limit the 546 * booting is too fragile that we want to limit the
546 * things done here to the most necessary things. 547 * things done here to the most necessary things.
547 */ 548 */
549#ifdef CONFIG_VMI
550 vmi_bringup();
551#endif
548 secondary_cpu_init(); 552 secondary_cpu_init();
549 preempt_disable(); 553 preempt_disable();
550 smp_callin(); 554 smp_callin();
551 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 555 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
552 rep_nop(); 556 rep_nop();
553 setup_secondary_APIC_clock(); 557 setup_secondary_clock();
554 if (nmi_watchdog == NMI_IO_APIC) { 558 if (nmi_watchdog == NMI_IO_APIC) {
555 disable_8259A_irq(0); 559 disable_8259A_irq(0);
556 enable_NMI_through_LVT0(NULL); 560 enable_NMI_through_LVT0(NULL);
@@ -619,7 +623,6 @@ extern struct {
619 unsigned short ss; 623 unsigned short ss;
620} stack_start; 624} stack_start;
621extern struct i386_pda *start_pda; 625extern struct i386_pda *start_pda;
622extern struct Xgt_desc_struct cpu_gdt_descr;
623 626
624#ifdef CONFIG_NUMA 627#ifdef CONFIG_NUMA
625 628
@@ -835,6 +838,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
835 num_starts = 0; 838 num_starts = 0;
836 839
837 /* 840 /*
841 * Paravirt / VMI wants a startup IPI hook here to set up the
842 * target processor state.
843 */
844 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
845 (unsigned long) stack_start.esp);
846
847 /*
838 * Run STARTUP IPI loop. 848 * Run STARTUP IPI loop.
839 */ 849 */
840 Dprintk("#startup loops: %d.\n", num_starts); 850 Dprintk("#startup loops: %d.\n", num_starts);
@@ -1320,7 +1330,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1320 1330
1321 smpboot_setup_io_apic(); 1331 smpboot_setup_io_apic();
1322 1332
1323 setup_boot_APIC_clock(); 1333 setup_boot_clock();
1324 1334
1325 /* 1335 /*
1326 * Synchronize the TSC with the AP 1336 * Synchronize the TSC with the AP
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index bc882a2b1db6..13ca54a85a1c 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -78,7 +78,7 @@ int __init sysenter_setup(void)
78 syscall_pages[0] = virt_to_page(syscall_page); 78 syscall_pages[0] = virt_to_page(syscall_page);
79 79
80#ifdef CONFIG_COMPAT_VDSO 80#ifdef CONFIG_COMPAT_VDSO
81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); 81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); 82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
83#endif 83#endif
84 84
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index c505b16c0990..a4f67a6e6821 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs)
131 unsigned long pc = instruction_pointer(regs); 131 unsigned long pc = instruction_pointer(regs);
132 132
133#ifdef CONFIG_SMP 133#ifdef CONFIG_SMP
134 if (!user_mode_vm(regs) && in_lock_functions(pc)) { 134 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
135 in_lock_functions(pc)) {
135#ifdef CONFIG_FRAME_POINTER 136#ifdef CONFIG_FRAME_POINTER
136 return *(unsigned long *)(regs->ebp + 4); 137 return *(unsigned long *)(regs->ebp + 4);
137#else 138#else
138 unsigned long *sp; 139 unsigned long *sp = (unsigned long *)&regs->esp;
139 if ((regs->xcs & 3) == 0) 140
140 sp = (unsigned long *)&regs->esp;
141 else
142 sp = (unsigned long *)regs->esp;
143 /* Return address is either directly at stack pointer 141 /* Return address is either directly at stack pointer
144 or above a saved eflags. Eflags has bits 22-31 zero, 142 or above a saved eflags. Eflags has bits 22-31 zero,
145 kernel addresses don't. */ 143 kernel addresses don't. */
@@ -232,6 +230,7 @@ EXPORT_SYMBOL(get_cmos_time);
232static void sync_cmos_clock(unsigned long dummy); 230static void sync_cmos_clock(unsigned long dummy);
233 231
234static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); 232static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
233int no_sync_cmos_clock;
235 234
236static void sync_cmos_clock(unsigned long dummy) 235static void sync_cmos_clock(unsigned long dummy)
237{ 236{
@@ -275,7 +274,8 @@ static void sync_cmos_clock(unsigned long dummy)
275 274
276void notify_arch_cmos_timer(void) 275void notify_arch_cmos_timer(void)
277{ 276{
278 mod_timer(&sync_cmos_timer, jiffies + 1); 277 if (!no_sync_cmos_clock)
278 mod_timer(&sync_cmos_timer, jiffies + 1);
279} 279}
280 280
281static long clock_cmos_diff; 281static long clock_cmos_diff;
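A clock implementation that owns wallclock time can now opt out of the periodic CMOS writeback by setting this flag; the VMI timer code elsewhere in this patch is the intended user. A hypothetical sketch:

	extern int no_sync_cmos_clock;

	static __init int my_pv_timer_init(void)
	{
		/* the hypervisor keeps the RTC consistent; skip the writeback */
		no_sync_cmos_clock = 1;
		return 0;
	}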
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 0efad8aeb41a..af0d3f70a817 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void);
94asmlinkage void machine_check(void); 94asmlinkage void machine_check(void);
95 95
96int kstack_depth_to_print = 24; 96int kstack_depth_to_print = 24;
97static unsigned int code_bytes = 64;
97ATOMIC_NOTIFIER_HEAD(i386die_chain); 98ATOMIC_NOTIFIER_HEAD(i386die_chain);
98 99
99int register_die_notifier(struct notifier_block *nb) 100int register_die_notifier(struct notifier_block *nb)
@@ -291,10 +292,11 @@ void show_registers(struct pt_regs *regs)
291 int i; 292 int i;
292 int in_kernel = 1; 293 int in_kernel = 1;
293 unsigned long esp; 294 unsigned long esp;
294 unsigned short ss; 295 unsigned short ss, gs;
295 296
296 esp = (unsigned long) (&regs->esp); 297 esp = (unsigned long) (&regs->esp);
297 savesegment(ss, ss); 298 savesegment(ss, ss);
299 savesegment(gs, gs);
298 if (user_mode_vm(regs)) { 300 if (user_mode_vm(regs)) {
299 in_kernel = 0; 301 in_kernel = 0;
300 esp = regs->esp; 302 esp = regs->esp;
@@ -313,8 +315,8 @@ void show_registers(struct pt_regs *regs)
313 regs->eax, regs->ebx, regs->ecx, regs->edx); 315 regs->eax, regs->ebx, regs->ecx, regs->edx);
314 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 316 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
315 regs->esi, regs->edi, regs->ebp, esp); 317 regs->esi, regs->edi, regs->ebp, esp);
316 printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", 318 printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
317 regs->xds & 0xffff, regs->xes & 0xffff, ss); 319 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
318 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 320 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
319 TASK_COMM_LEN, current->comm, current->pid, 321 TASK_COMM_LEN, current->comm, current->pid,
320 current_thread_info(), current, current->thread_info); 322 current_thread_info(), current, current->thread_info);
@@ -324,7 +326,8 @@ void show_registers(struct pt_regs *regs)
324 */ 326 */
325 if (in_kernel) { 327 if (in_kernel) {
326 u8 *eip; 328 u8 *eip;
327 int code_bytes = 64; 329 unsigned int code_prologue = code_bytes * 43 / 64;
330 unsigned int code_len = code_bytes;
328 unsigned char c; 331 unsigned char c;
329 332
330 printk("\n" KERN_EMERG "Stack: "); 333 printk("\n" KERN_EMERG "Stack: ");
@@ -332,14 +335,14 @@ void show_registers(struct pt_regs *regs)
332 335
333 printk(KERN_EMERG "Code: "); 336 printk(KERN_EMERG "Code: ");
334 337
335 eip = (u8 *)regs->eip - 43; 338 eip = (u8 *)regs->eip - code_prologue;
336 if (eip < (u8 *)PAGE_OFFSET || 339 if (eip < (u8 *)PAGE_OFFSET ||
337 probe_kernel_address(eip, c)) { 340 probe_kernel_address(eip, c)) {
338 /* try starting at EIP */ 341 /* try starting at EIP */
339 eip = (u8 *)regs->eip; 342 eip = (u8 *)regs->eip;
340 code_bytes = 32; 343 code_len = code_len - code_prologue + 1;
341 } 344 }
342 for (i = 0; i < code_bytes; i++, eip++) { 345 for (i = 0; i < code_len; i++, eip++) {
343 if (eip < (u8 *)PAGE_OFFSET || 346 if (eip < (u8 *)PAGE_OFFSET ||
344 probe_kernel_address(eip, c)) { 347 probe_kernel_address(eip, c)) {
345 printk(" Bad EIP value."); 348 printk(" Bad EIP value.");
@@ -1191,3 +1194,13 @@ static int __init kstack_setup(char *s)
1191 return 1; 1194 return 1;
1192} 1195}
1193__setup("kstack=", kstack_setup); 1196__setup("kstack=", kstack_setup);
1197
1198static int __init code_bytes_setup(char *s)
1199{
1200 code_bytes = simple_strtoul(s, NULL, 0);
1201 if (code_bytes > 8192)
1202 code_bytes = 8192;
1203
1204 return 1;
1205}
1206__setup("code_bytes=", code_bytes_setup);
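The new parameter widens or narrows the Code: section of oops output, clamped at 8192 bytes; for example, booting with code_bytes=128 doubles the default 64-byte dump.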
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 2cfc7b09b925..46f752a8bbf3 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -23,6 +23,7 @@
23 * an extra value to store the TSC freq 23 * an extra value to store the TSC freq
24 */ 24 */
25unsigned int tsc_khz; 25unsigned int tsc_khz;
26unsigned long long (*custom_sched_clock)(void);
26 27
27int tsc_disable; 28int tsc_disable;
28 29
@@ -107,14 +108,14 @@ unsigned long long sched_clock(void)
107{ 108{
108 unsigned long long this_offset; 109 unsigned long long this_offset;
109 110
111 if (unlikely(custom_sched_clock))
112 return (*custom_sched_clock)();
113
110 /* 114 /*
111 * in the NUMA case we dont use the TSC as they are not 115 * Fall back to jiffies if there's no TSC available:
112 * synchronized across all CPUs.
113 */ 116 */
114#ifndef CONFIG_NUMA 117 if (unlikely(tsc_disable))
115 if (!cpu_khz || check_tsc_unstable()) 118 /* No locking but a rare wrong value is not a big deal: */
116#endif
117 /* no locking but a rare wrong value is not a big deal */
118 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 119 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
119 120
120 /* read the Time Stamp Counter: */ 121 /* read the Time Stamp Counter: */
@@ -194,13 +195,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
194void __init tsc_init(void) 195void __init tsc_init(void)
195{ 196{
196 if (!cpu_has_tsc || tsc_disable) 197 if (!cpu_has_tsc || tsc_disable)
197 return; 198 goto out_no_tsc;
198 199
199 cpu_khz = calculate_cpu_khz(); 200 cpu_khz = calculate_cpu_khz();
200 tsc_khz = cpu_khz; 201 tsc_khz = cpu_khz;
201 202
202 if (!cpu_khz) 203 if (!cpu_khz)
203 return; 204 goto out_no_tsc;
204 205
205 printk("Detected %lu.%03lu MHz processor.\n", 206 printk("Detected %lu.%03lu MHz processor.\n",
206 (unsigned long)cpu_khz / 1000, 207 (unsigned long)cpu_khz / 1000,
@@ -208,6 +209,15 @@ void __init tsc_init(void)
208 209
209 set_cyc2ns_scale(cpu_khz); 210 set_cyc2ns_scale(cpu_khz);
210 use_tsc_delay(); 211 use_tsc_delay();
212 return;
213
214out_no_tsc:
215 /*
216 * Set the tsc_disable flag if there's no TSC support, this
217 * makes it a fast flag for the kernel to see whether it
218 * should be using the TSC.
219 */
220 tsc_disable = 1;
211} 221}
212 222
213#ifdef CONFIG_CPU_FREQ 223#ifdef CONFIG_CPU_FREQ
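custom_sched_clock lets a paravirt clock replace the TSC-based sched_clock() wholesale; vmitime.c in this patch installs its implementation through this hook. A sketch, where my_read_counter_ns() stands in for whatever monotonic counter a backend exposes:

	extern unsigned long long (*custom_sched_clock)(void);

	static unsigned long long my_sched_clock(void)
	{
		return my_read_counter_ns();	/* hypothetical ns-resolution counter */
	}

	static __init int my_clock_init(void)
	{
		custom_sched_clock = my_sched_clock;
		return 0;
	}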
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index be2f96e67f78..d1b8f2b7aea6 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
96{ 96{
97 int ret = 0; 97 int ret = 0;
98 98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to 99 /* kernel_vm86_regs is missing xgs, so copy everything up to
100 (but not including) xgs, and then rest after xgs. */ 100 (but not including) orig_eax, and then rest including orig_eax. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); 101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs, 102 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
103 sizeof(struct kernel_vm86_regs) - 103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs)); 104 offsetof(struct kernel_vm86_regs, pt.orig_eax));
105 105
106 return ret; 106 return ret;
107} 107}
@@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
113{ 113{
114 int ret = 0; 114 int ret = 0;
115 115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); 116 /* copy eax-xfs inclusive */
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs, 117 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
118 /* copy orig_eax-__gsh+extra */
119 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
118 sizeof(struct kernel_vm86_regs) - 120 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) + 121 offsetof(struct kernel_vm86_regs, pt.orig_eax) +
120 extra); 122 extra);
121
122 return ret; 123 return ret;
123} 124}
124 125
@@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
157 158
158 ret = KVM86->regs32; 159 ret = KVM86->regs32;
159 160
160 loadsegment(fs, current->thread.saved_fs); 161 ret->xfs = current->thread.saved_fs;
161 ret->xgs = current->thread.saved_gs; 162 loadsegment(gs, current->thread.saved_gs);
162 163
163 return ret; 164 return ret;
164} 165}
@@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
285 */ 286 */
286 info->regs.pt.xds = 0; 287 info->regs.pt.xds = 0;
287 info->regs.pt.xes = 0; 288 info->regs.pt.xes = 0;
288 info->regs.pt.xgs = 0; 289 info->regs.pt.xfs = 0;
289 290
290/* we are clearing fs later just before "jmp resume_userspace", 291/* we are clearing gs later just before "jmp resume_userspace",
291 * because it is not saved/restored. 292 * because it is not saved/restored.
292 */ 293 */
293 294
@@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
321 */ 322 */
322 info->regs32->eax = 0; 323 info->regs32->eax = 0;
323 tsk->thread.saved_esp0 = tsk->thread.esp0; 324 tsk->thread.saved_esp0 = tsk->thread.esp0;
324 savesegment(fs, tsk->thread.saved_fs); 325 tsk->thread.saved_fs = info->regs32->xfs;
325 tsk->thread.saved_gs = info->regs32->xgs; 326 savesegment(gs, tsk->thread.saved_gs);
326 327
327 tss = &per_cpu(init_tss, get_cpu()); 328 tss = &per_cpu(init_tss, get_cpu());
328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 329 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
342 __asm__ __volatile__( 343 __asm__ __volatile__(
343 "movl %0,%%esp\n\t" 344 "movl %0,%%esp\n\t"
344 "movl %1,%%ebp\n\t" 345 "movl %1,%%ebp\n\t"
345 "mov %2, %%fs\n\t" 346 "mov %2, %%gs\n\t"
346 "jmp resume_userspace" 347 "jmp resume_userspace"
347 : /* no outputs */ 348 : /* no outputs */
348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
new file mode 100644
index 000000000000..bb5a7abf949c
--- /dev/null
+++ b/arch/i386/kernel/vmi.c
@@ -0,0 +1,949 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/license.h>
27#include <linux/cpu.h>
28#include <linux/bootmem.h>
29#include <linux/mm.h>
30#include <asm/vmi.h>
31#include <asm/io.h>
32#include <asm/fixmap.h>
33#include <asm/apicdef.h>
34#include <asm/apic.h>
35#include <asm/processor.h>
36#include <asm/timer.h>
37#include <asm/vmi_time.h>
38
39/* Convenient for calling VMI functions indirectly in the ROM */
40typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
41typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
42
43#define call_vrom_func(rom,func) \
44 (((VROMFUNC *)(rom->func))())
45
46#define call_vrom_long_func(rom,func,arg) \
47 (((VROMLONGFUNC *)(rom->func)) (arg))
48
49static struct vrom_header *vmi_rom;
50static int license_gplok;
51static int disable_nodelay;
52static int disable_pge;
53static int disable_pse;
54static int disable_sep;
55static int disable_tsc;
56static int disable_mtrr;
57
58/* Cached VMI operations */
59struct {
60 void (*cpuid)(void /* non-c */);
61 void (*_set_ldt)(u32 selector);
62 void (*set_tr)(u32 selector);
63 void (*set_kernel_stack)(u32 selector, u32 esp0);
64 void (*allocate_page)(u32, u32, u32, u32, u32);
65 void (*release_page)(u32, u32);
66 void (*set_pte)(pte_t, pte_t *, unsigned);
67 void (*update_pte)(pte_t *, unsigned);
68 void (*set_linear_mapping)(int, u32, u32, u32);
69 void (*flush_tlb)(int);
70 void (*set_initial_ap_state)(int, int);
71 void (*halt)(void);
72} vmi_ops;
73
74/* XXX move this to alternative.h */
75extern struct paravirt_patch __start_parainstructions[],
76 __stop_parainstructions[];
77
78/*
79 * VMI patching routines.
80 */
81#define MNEM_CALL 0xe8
82#define MNEM_JMP 0xe9
83#define MNEM_RET 0xc3
84
85static char irq_save_disable_callout[] = {
86 MNEM_CALL, 0, 0, 0, 0,
87 MNEM_CALL, 0, 0, 0, 0,
88 MNEM_RET
89};
90#define IRQ_PATCH_INT_MASK 0
91#define IRQ_PATCH_DISABLE 5
92
93static inline void patch_offset(unsigned char *eip, unsigned char *dest)
94{
95 *(unsigned long *)(eip+1) = dest-eip-5;
96}
97
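patch_offset() stores the rel32 operand of a 5-byte CALL or JMP, which is counted from the end of the instruction: dest - eip - 5. As a worked example with illustrative addresses, a call at eip 0xc0100000 targeting 0xc0200000 is encoded as the byte 0xe8 followed by the little-endian value 0x000ffffb.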
98static unsigned patch_internal(int call, unsigned len, void *insns)
99{
100 u64 reloc;
101 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
102 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
103 switch(rel->type) {
104 case VMI_RELOCATION_CALL_REL:
105 BUG_ON(len < 5);
106 *(char *)insns = MNEM_CALL;
107 patch_offset(insns, rel->eip);
108 return 5;
109
110 case VMI_RELOCATION_JUMP_REL:
111 BUG_ON(len < 5);
112 *(char *)insns = MNEM_JMP;
113 patch_offset(insns, rel->eip);
114 return 5;
115
116 case VMI_RELOCATION_NOP:
117 /* obliterate the whole thing */
118 return 0;
119
120 case VMI_RELOCATION_NONE:
121 /* leave native code in place */
122 break;
123
124 default:
125 BUG();
126 }
127 return len;
128}
129
130/*
131 * Apply patch if appropriate, return length of new instruction
132 * sequence. The callee does nop padding for us.
133 */
134static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
135{
136 switch (type) {
137 case PARAVIRT_IRQ_DISABLE:
138 return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
139 case PARAVIRT_IRQ_ENABLE:
140 return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
141 case PARAVIRT_RESTORE_FLAGS:
142 return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
143 case PARAVIRT_SAVE_FLAGS:
144 return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
145 case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
146 if (len >= 10) {
147 patch_internal(VMI_CALL_GetInterruptMask, len, insns);
148 patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
149 return 10;
150 } else {
151 /*
152 * You bastards didn't leave enough room to
153 * patch save_flags_irq_disable inline. Patch
154 * to a helper
155 */
156 BUG_ON(len < 5);
157 *(char *)insns = MNEM_CALL;
158 patch_offset(insns, irq_save_disable_callout);
159 return 5;
160 }
161 case PARAVIRT_INTERRUPT_RETURN:
162 return patch_internal(VMI_CALL_IRET, len, insns);
163 case PARAVIRT_STI_SYSEXIT:
164 return patch_internal(VMI_CALL_SYSEXIT, len, insns);
165 default:
166 break;
167 }
168 return len;
169}
170
171/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
172static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
173 unsigned int *ecx, unsigned int *edx)
174{
175 int override = 0;
176 if (*eax == 1)
177 override = 1;
178 asm volatile ("call *%6"
179 : "=a" (*eax),
180 "=b" (*ebx),
181 "=c" (*ecx),
182 "=d" (*edx)
183 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
184 if (override) {
185 if (disable_pse)
186 *edx &= ~X86_FEATURE_PSE;
187 if (disable_pge)
188 *edx &= ~X86_FEATURE_PGE;
189 if (disable_sep)
190 *edx &= ~X86_FEATURE_SEP;
191 if (disable_tsc)
192 *edx &= ~X86_FEATURE_TSC;
193 if (disable_mtrr)
194 *edx &= ~X86_FEATURE_MTRR;
195 }
196}
197
198static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
199{
200 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
201 write_gdt_entry(gdt, nr, new->a, new->b);
202}
203
204static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
205{
206 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
207 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
208 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
209 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
210}
211
212static void vmi_set_ldt(const void *addr, unsigned entries)
213{
214 unsigned cpu = smp_processor_id();
215 u32 low, high;
216
217 pack_descriptor(&low, &high, (unsigned long)addr,
218 entries * sizeof(struct desc_struct) - 1,
219 DESCTYPE_LDT, 0);
220 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
221 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
222}
223
224static void vmi_set_tr(void)
225{
226 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
227}
228
229static void vmi_load_esp0(struct tss_struct *tss,
230 struct thread_struct *thread)
231{
232 tss->esp0 = thread->esp0;
233
234 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
235 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
236 tss->ss1 = thread->sysenter_cs;
237 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
238 }
239 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
240}
241
242static void vmi_flush_tlb_user(void)
243{
244 vmi_ops.flush_tlb(VMI_FLUSH_TLB);
245}
246
247static void vmi_flush_tlb_kernel(void)
248{
249 vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
250}
251
252/* Stub to do nothing at all; used for delays and unimplemented calls */
253static void vmi_nop(void)
254{
255}
256
257/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
258#ifdef CONFIG_NO_IDLE_HZ
259static fastcall void vmi_safe_halt(void)
260{
261 int idle = vmi_stop_hz_timer();
262 vmi_ops.halt();
263 if (idle) {
264 local_irq_disable();
265 vmi_account_time_restart_hz_timer();
266 local_irq_enable();
267 }
268}
269#endif
270
271#ifdef CONFIG_DEBUG_PAGE_TYPE
272
273#ifdef CONFIG_X86_PAE
274#define MAX_BOOT_PTS (2048+4+1)
275#else
276#define MAX_BOOT_PTS (1024+1)
277#endif
278
279/*
280 * During boot, mem_map is not yet available in paging_init, so stash
281 * all the boot page allocations here.
282 */
283static struct {
284 u32 pfn;
285 int type;
286} boot_page_allocations[MAX_BOOT_PTS];
287static int num_boot_page_allocations;
288static int boot_allocations_applied;
289
290void vmi_apply_boot_page_allocations(void)
291{
292 int i;
293 BUG_ON(!mem_map);
294 for (i = 0; i < num_boot_page_allocations; i++) {
295 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
296 page->type = boot_page_allocations[i].type;
297 page->type = boot_page_allocations[i].type &
298 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
299 }
300 boot_allocations_applied = 1;
301}
302
303static void record_page_type(u32 pfn, int type)
304{
305 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
306 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
307 boot_page_allocations[num_boot_page_allocations].type = type;
308 num_boot_page_allocations++;
309}
310
311static void check_zeroed_page(u32 pfn, int type, struct page *page)
312{
313 u32 *ptr;
314 int i;
315 int limit = PAGE_SIZE / sizeof(int);
316
317 if (page_address(page))
318 ptr = (u32 *)page_address(page);
319 else
320 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
321 /*
322 * When cloning the root in non-PAE mode, only the userspace
323 * pdes need to be zeroed.
324 */
325 if (type & VMI_PAGE_CLONE)
326 limit = USER_PTRS_PER_PGD;
327 for (i = 0; i < limit; i++)
328 BUG_ON(ptr[i]);
329}
330
331/*
332 * We stash the page type into struct page so we can verify the page
333 * types are used properly.
334 */
335static void vmi_set_page_type(u32 pfn, int type)
336{
337 /* PAE can have multiple roots per page - don't track */
338 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
339 return;
340
341 if (boot_allocations_applied) {
342 struct page *page = pfn_to_page(pfn);
343 if (type != VMI_PAGE_NORMAL)
344 BUG_ON(page->type);
345 else
346 BUG_ON(page->type == VMI_PAGE_NORMAL);
347 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
348 if (type & VMI_PAGE_ZEROED)
349 check_zeroed_page(pfn, type, page);
350 } else {
351 record_page_type(pfn, type);
352 }
353}
354
355static void vmi_check_page_type(u32 pfn, int type)
356{
357 /* PAE can have multiple roots per page - skip checks */
358 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
359 return;
360
361 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
362 if (boot_allocations_applied) {
363 struct page *page = pfn_to_page(pfn);
364 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
365 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
366 BUG_ON((type & page->type) == 0);
367 }
368}
369#else
370#define vmi_set_page_type(p,t) do { } while (0)
371#define vmi_check_page_type(p,t) do { } while (0)
372#endif
373
374static void vmi_allocate_pt(u32 pfn)
375{
376 vmi_set_page_type(pfn, VMI_PAGE_L1);
377 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
378}
379
380static void vmi_allocate_pd(u32 pfn)
381{
382 /*
383 * This call comes in very early, before mem_map is setup.
384 * It is called only for swapper_pg_dir, which already has
385 * data on it.
386 */
387 vmi_set_page_type(pfn, VMI_PAGE_L2);
388 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
389}
390
391static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
392{
393 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
394 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
395 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
396}
397
398static void vmi_release_pt(u32 pfn)
399{
400 vmi_ops.release_page(pfn, VMI_PAGE_L1);
401 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
402}
403
404static void vmi_release_pd(u32 pfn)
405{
406 vmi_ops.release_page(pfn, VMI_PAGE_L2);
407 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
408}
409
410/*
411 * Helper macros for MMU update flags. We can defer updates until a flush
412 * or page invalidation only if the update is to the current address space
413 * (otherwise, there is no flush). We must check against init_mm, since
414 * this could be a kernel update, which usually passes init_mm, although
415 * sometimes this check can be skipped if we know the particular function
416 * is only called on user mode PTEs. We could change the kernel to pass
417 * current->active_mm here, but in particular, I was unsure if changing
418 * mm/highmem.c to do this would still be correct on other architectures.
419 */
420#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
421 (!mustbeuser && (mm) == &init_mm))
422#define vmi_flags_addr(mm, addr, level, user) \
423 ((level) | (is_current_as(mm, user) ? \
424 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
425#define vmi_flags_addr_defer(mm, addr, level, user) \
426 ((level) | (is_current_as(mm, user) ? \
427 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
428
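As a worked example: for a PTE update at addr in the current address space, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0) evaluates to VMI_PAGE_PT | VMI_PAGE_CURRENT_AS | (addr & VMI_PAGE_VA_MASK), giving the hypervisor enough context to defer or narrow the flush; for a foreign mm the ternary collapses and plain VMI_PAGE_PT is passed.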
429static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
430{
431 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
432 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
433}
434
435static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
436{
437 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
438 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
439}
440
441static void vmi_set_pte(pte_t *ptep, pte_t pte)
442{
443 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
444 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
445 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
446}
447
448static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
449{
450 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
451 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
452}
453
454static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
455{
456#ifdef CONFIG_X86_PAE
457 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
458 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
459#else
460 const pte_t pte = { pmdval.pud.pgd.pgd };
461 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
462#endif
463 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
464}
465
466#ifdef CONFIG_X86_PAE
467
468static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
469{
470 /*
471 * XXX This is called from set_pmd_pte, but at both PT
472 * and PD layers so the VMI_PAGE_PT flag is wrong. But
473 * it is only called for large page mapping changes,
474 * the Xen backend, doesn't support large pages, and the
475 * ESX backend doesn't depend on the flag.
476 */
477 set_64bit((unsigned long long *)ptep,pte_val(pteval));
478 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
479}
480
481static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
482{
483 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
484 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
485}
486
487static void vmi_set_pud(pud_t *pudp, pud_t pudval)
488{
489 /* Um, eww */
490 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
491 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
492 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
493}
494
495static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
496{
497 const pte_t pte = { 0 };
498 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
499 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
500}
501
502void vmi_pmd_clear(pmd_t *pmd)
503{
504 const pte_t pte = { 0 };
505 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
506 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
507}
508#endif
509
510#ifdef CONFIG_SMP
511struct vmi_ap_state ap;
512extern void setup_pda(void);
513
514static void __init /* XXX cpu hotplug */
515vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
516 unsigned long start_esp)
517{
518 /* Default everything to zero. This is fine for most GPRs. */
519 memset(&ap, 0, sizeof(struct vmi_ap_state));
520
521 ap.gdtr_limit = GDT_SIZE - 1;
522 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
523
524 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
525 ap.idtr_base = (unsigned long) idt_table;
526
527 ap.ldtr = 0;
528
529 ap.cs = __KERNEL_CS;
530 ap.eip = (unsigned long) start_eip;
531 ap.ss = __KERNEL_DS;
532 ap.esp = (unsigned long) start_esp;
533
534 ap.ds = __USER_DS;
535 ap.es = __USER_DS;
536 ap.fs = __KERNEL_PDA;
537 ap.gs = 0;
538
539 ap.eflags = 0;
540
541 setup_pda();
542
543#ifdef CONFIG_X86_PAE
544 /* efer should match BSP efer. */
545 if (cpu_has_nx) {
546 unsigned l, h;
547 rdmsr(MSR_EFER, l, h);
548 ap.efer = (unsigned long long) h << 32 | l;
549 }
550#endif
551
552 ap.cr3 = __pa(swapper_pg_dir);
553 /* Protected mode, paging, AM, WP, NE, MP. */
554 ap.cr0 = 0x80050023;
555 ap.cr4 = mmu_cr4_features;
556 vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid);
557}
558#endif
559
560static inline int __init check_vmi_rom(struct vrom_header *rom)
561{
562 struct pci_header *pci;
563 struct pnp_header *pnp;
564 const char *manufacturer = "UNKNOWN";
565 const char *product = "UNKNOWN";
566 const char *license = "unspecified";
567
568 if (rom->rom_signature != 0xaa55)
569 return 0;
570 if (rom->vrom_signature != VMI_SIGNATURE)
571 return 0;
572 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
573 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
574 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
575 rom->api_version_maj,
576 rom->api_version_min);
577 return 0;
578 }
579
580 /*
581 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
582 * the PCI header and device type to make sure this is really a
583 * VMI device.
584 */
585 if (!rom->pci_header_offs) {
586 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
587 return 0;
588 }
589
590 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
591 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
592 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
593		/* Allow it to run anyway, but warn */
594 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
595 }
596
597 if (rom->pnp_header_offs) {
598 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
599 if (pnp->manufacturer_offset)
600 manufacturer = (const char *)rom+pnp->manufacturer_offset;
601 if (pnp->product_offset)
602 product = (const char *)rom+pnp->product_offset;
603 }
604
605 if (rom->license_offs)
606 license = (char *)rom+rom->license_offs;
607
608 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
609 manufacturer, product,
610 rom->api_version_maj, rom->api_version_min,
611 pci->rom_version_maj, pci->rom_version_min);
612
613 license_gplok = license_is_gpl_compatible(license);
614 if (!license_gplok) {
615 printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... "
616 "inlining disabled\n",
617 license);
618 add_taint(TAINT_PROPRIETARY_MODULE);
619 }
620 return 1;
621}
622
623/*
624 * Probe for the VMI option ROM
625 */
626static inline int __init probe_vmi_rom(void)
627{
628 unsigned long base;
629
630 /* VMI ROM is in option ROM area, check signature */
631 for (base = 0xC0000; base < 0xE0000; base += 2048) {
632 struct vrom_header *romstart;
633 romstart = (struct vrom_header *)isa_bus_to_virt(base);
634 if (check_vmi_rom(romstart)) {
635 vmi_rom = romstart;
636 return 1;
637 }
638 }
639 return 0;
640}
641
642/*
643 * VMI setup common to all processors
644 */
645void vmi_bringup(void)
646{
647 /* We must establish the lowmem mapping for MMU ops to work */
648 if (vmi_rom)
649 vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
650}
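
Note (editorial sketch, not part of the patch): set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0) declares the standard i386 lowmem mapping, where virtual address __PAGE_OFFSET + x corresponds to physical address x for the first max_low_pfn pages, so the hypervisor backend can translate the page-table updates that follow.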
651
652/*
653 * Return a pointer to the VMI function or a NOP stub
654 */
655static void *vmi_get_function(int vmicall)
656{
657 u64 reloc;
658 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
659 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
660 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
661 if (rel->type == VMI_RELOCATION_CALL_REL)
662 return (void *)rel->eip;
663 else
664 return (void *)vmi_nop;
665}
666
667/*
668 * Helper macro for making the VMI paravirt-ops fill code readable.
669 * For unimplemented operations, fall back to default.
670 */
671#define para_fill(opname, vmicall) \
672do { \
673 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
674 VMI_CALL_##vmicall); \
675 if (rel->type != VMI_RELOCATION_NONE) { \
676 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \
677 paravirt_ops.opname = (void *)rel->eip; \
678 } \
679} while (0)
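
For illustration (editorial sketch, not part of the patch), a call such as para_fill(clts, CLTS) expands to:

	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CLTS);
	if (rel->type != VMI_RELOCATION_NONE) {
		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
		paravirt_ops.clts = (void *)rel->eip;
	}

so each op is redirected into the ROM only when the ROM provides a direct call target, and otherwise keeps its native default.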
680
681/*
682 * Activate the VMI interface and switch into paravirtualized mode
683 */
684static inline int __init activate_vmi(void)
685{
686 short kernel_cs;
687 u64 reloc;
688 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
689
690 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
691		printk(KERN_ERR "VMI ROM failed to initialize!\n");
692 return 0;
693 }
694 savesegment(cs, kernel_cs);
695
696 paravirt_ops.paravirt_enabled = 1;
697 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
698
699 paravirt_ops.patch = vmi_patch;
700 paravirt_ops.name = "vmi";
701
702 /*
703 * Many of these operations are ABI compatible with VMI.
704 * This means we can fill in the paravirt-ops with direct
705 * pointers into the VMI ROM. If the calling convention for
706 * these operations changes, this code needs to be updated.
707 *
708	 * Exceptions:
709 * CPUID paravirt-op uses pointers, not the native ISA
710 * halt has no VMI equivalent; all VMI halts are "safe"
711 * no MSR support yet - just trap and emulate. VMI uses the
712 * same ABI as the native ISA, but Linux wants exceptions
713 * from bogus MSR read / write handled
714 * rdpmc is not yet used in Linux
715 */
716
717 /* CPUID is special, so very special */
718 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID);
719 if (rel->type != VMI_RELOCATION_NONE) {
720 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
721 vmi_ops.cpuid = (void *)rel->eip;
722 paravirt_ops.cpuid = vmi_cpuid;
723 }
724
725 para_fill(clts, CLTS);
726 para_fill(get_debugreg, GetDR);
727 para_fill(set_debugreg, SetDR);
728 para_fill(read_cr0, GetCR0);
729 para_fill(read_cr2, GetCR2);
730 para_fill(read_cr3, GetCR3);
731 para_fill(read_cr4, GetCR4);
732 para_fill(write_cr0, SetCR0);
733 para_fill(write_cr2, SetCR2);
734 para_fill(write_cr3, SetCR3);
735 para_fill(write_cr4, SetCR4);
736 para_fill(save_fl, GetInterruptMask);
737 para_fill(restore_fl, SetInterruptMask);
738 para_fill(irq_disable, DisableInterrupts);
739 para_fill(irq_enable, EnableInterrupts);
740 /* irq_save_disable !!! sheer pain */
741 patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
742 (char *)paravirt_ops.save_fl);
743 patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
744 (char *)paravirt_ops.irq_disable);
745#ifndef CONFIG_NO_IDLE_HZ
746 para_fill(safe_halt, Halt);
747#else
748 vmi_ops.halt = vmi_get_function(VMI_CALL_Halt);
749 paravirt_ops.safe_halt = vmi_safe_halt;
750#endif
751 para_fill(wbinvd, WBINVD);
752 /* paravirt_ops.read_msr = vmi_rdmsr */
753 /* paravirt_ops.write_msr = vmi_wrmsr */
754 para_fill(read_tsc, RDTSC);
755 /* paravirt_ops.rdpmc = vmi_rdpmc */
756
757 /* TR interface doesn't pass TR value */
758 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR);
759 if (rel->type != VMI_RELOCATION_NONE) {
760 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
761 vmi_ops.set_tr = (void *)rel->eip;
762 paravirt_ops.load_tr_desc = vmi_set_tr;
763 }
764
765 /* LDT is special, too */
766 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT);
767 if (rel->type != VMI_RELOCATION_NONE) {
768 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
769 vmi_ops._set_ldt = (void *)rel->eip;
770 paravirt_ops.set_ldt = vmi_set_ldt;
771 }
772
773 para_fill(load_gdt, SetGDT);
774 para_fill(load_idt, SetIDT);
775 para_fill(store_gdt, GetGDT);
776 para_fill(store_idt, GetIDT);
777 para_fill(store_tr, GetTR);
778 paravirt_ops.load_tls = vmi_load_tls;
779 para_fill(write_ldt_entry, WriteLDTEntry);
780 para_fill(write_gdt_entry, WriteGDTEntry);
781 para_fill(write_idt_entry, WriteIDTEntry);
782 reloc = call_vrom_long_func(vmi_rom, get_reloc,
783 VMI_CALL_UpdateKernelStack);
784 if (rel->type != VMI_RELOCATION_NONE) {
785 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
786 vmi_ops.set_kernel_stack = (void *)rel->eip;
787 paravirt_ops.load_esp0 = vmi_load_esp0;
788 }
789
790 para_fill(set_iopl_mask, SetIOPLMask);
791 paravirt_ops.io_delay = (void *)vmi_nop;
792 if (!disable_nodelay) {
793 paravirt_ops.const_udelay = (void *)vmi_nop;
794 }
795
796 para_fill(set_lazy_mode, SetLazyMode);
797
798 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB);
799 if (rel->type != VMI_RELOCATION_NONE) {
800 vmi_ops.flush_tlb = (void *)rel->eip;
801 paravirt_ops.flush_tlb_user = vmi_flush_tlb_user;
802 paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel;
803 }
804 para_fill(flush_tlb_single, InvalPage);
805
806 /*
807 * Until a standard flag format can be agreed on, we need to
808 * implement these as wrappers in Linux. Get the VMI ROM
809 * function pointers for the two backend calls.
810 */
811#ifdef CONFIG_X86_PAE
812 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
813 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
814#else
815 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
816 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
817#endif
818 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
819 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
820 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
821
822 paravirt_ops.alloc_pt = vmi_allocate_pt;
823 paravirt_ops.alloc_pd = vmi_allocate_pd;
824 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
825 paravirt_ops.release_pt = vmi_release_pt;
826 paravirt_ops.release_pd = vmi_release_pd;
827 paravirt_ops.set_pte = vmi_set_pte;
828 paravirt_ops.set_pte_at = vmi_set_pte_at;
829 paravirt_ops.set_pmd = vmi_set_pmd;
830 paravirt_ops.pte_update = vmi_update_pte;
831 paravirt_ops.pte_update_defer = vmi_update_pte_defer;
832#ifdef CONFIG_X86_PAE
833 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
834 paravirt_ops.set_pte_present = vmi_set_pte_present;
835 paravirt_ops.set_pud = vmi_set_pud;
836 paravirt_ops.pte_clear = vmi_pte_clear;
837 paravirt_ops.pmd_clear = vmi_pmd_clear;
838#endif
839 /*
840 * These MUST always be patched. Don't support indirect jumps
841 * through these operations, as the VMI interface may use either
842 * a jump or a call to get to these operations, depending on
843 * the backend. They are performance critical anyway, so requiring
844 * a patch is not a big problem.
845 */
846 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
847 paravirt_ops.iret = (void *)0xbadbab0;
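	/* Editorial note: the 0xfeedbab0 / 0xbadbab0 values above are
	 * presumably deliberate poison addresses, so that a missed patch
	 * faults immediately rather than jumping somewhere subtly wrong. */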
848
849#ifdef CONFIG_SMP
850 paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook;
851 vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState);
852#endif
853
854#ifdef CONFIG_X86_LOCAL_APIC
855 paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead);
856 paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite);
857 paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite);
858#endif
859
860 /*
861 * Check for VMI timer functionality by probing for a cycle frequency method
862 */
863 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
864 if (rel->type != VMI_RELOCATION_NONE) {
865 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
866 vmi_timer_ops.get_cycle_counter =
867 vmi_get_function(VMI_CALL_GetCycleCounter);
868 vmi_timer_ops.get_wallclock =
869 vmi_get_function(VMI_CALL_GetWallclockTime);
870 vmi_timer_ops.wallclock_updated =
871 vmi_get_function(VMI_CALL_WallclockUpdated);
872 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
873 vmi_timer_ops.cancel_alarm =
874 vmi_get_function(VMI_CALL_CancelAlarm);
875 paravirt_ops.time_init = vmi_time_init;
876 paravirt_ops.get_wallclock = vmi_get_wallclock;
877 paravirt_ops.set_wallclock = vmi_set_wallclock;
878#ifdef CONFIG_X86_LOCAL_APIC
879 paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
880 paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
881#endif
882 custom_sched_clock = vmi_sched_clock;
883 }
884
885 /*
886 * Alternative instruction rewriting doesn't happen soon enough
887 * to convert VMI_IRET to a call instead of a jump; so we have
888 * to do this before IRQs get reenabled. Fortunately, it is
889 * idempotent.
890 */
891 apply_paravirt(__start_parainstructions, __stop_parainstructions);
892
893 vmi_bringup();
894
895 return 1;
896}
897
898#undef para_fill
899
900void __init vmi_init(void)
901{
902 unsigned long flags;
903
904 if (!vmi_rom)
905 probe_vmi_rom();
906 else
907 check_vmi_rom(vmi_rom);
908
909	/* In case probing for or validating the ROM failed, bail */
910 if (!vmi_rom)
911 return;
912
913 reserve_top_address(-vmi_rom->virtual_top);
914
915 local_irq_save(flags);
916 activate_vmi();
917#ifdef CONFIG_SMP
918 no_timer_check = 1;
919#endif
920 local_irq_restore(flags & X86_EFLAGS_IF);
921}
922
923static int __init parse_vmi(char *arg)
924{
925 if (!arg)
926 return -EINVAL;
927
928 if (!strcmp(arg, "disable_nodelay"))
929 disable_nodelay = 1;
930 else if (!strcmp(arg, "disable_pge")) {
931 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
932 disable_pge = 1;
933 } else if (!strcmp(arg, "disable_pse")) {
934 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
935 disable_pse = 1;
936 } else if (!strcmp(arg, "disable_sep")) {
937 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
938 disable_sep = 1;
939 } else if (!strcmp(arg, "disable_tsc")) {
940 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
941 disable_tsc = 1;
942 } else if (!strcmp(arg, "disable_mtrr")) {
943 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
944 disable_mtrr = 1;
945 }
946 return 0;
947}
948
949early_param("vmi", parse_vmi);
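
Usage note (illustrative): these are ordinary kernel command line switches; booting with, e.g., vmi=disable_tsc clears the boot CPU's TSC feature bit early in boot, before feature-dependent setup runs.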
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
new file mode 100644
index 000000000000..2e2d8dbcbd68
--- /dev/null
+++ b/arch/i386/kernel/vmitime.c
@@ -0,0 +1,499 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25/*
26 * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
27 * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
28 * See comments there for proper credits.
29 */
30
31#include <linux/spinlock.h>
32#include <linux/init.h>
33#include <linux/errno.h>
34#include <linux/jiffies.h>
35#include <linux/interrupt.h>
36#include <linux/kernel_stat.h>
37#include <linux/rcupdate.h>
38#include <linux/clocksource.h>
39
40#include <asm/timer.h>
41#include <asm/io.h>
42#include <asm/apic.h>
43#include <asm/div64.h>
45#include <asm/desc.h>
46
47#include <asm/vmi.h>
48#include <asm/vmi_time.h>
49
50#include <mach_timer.h>
51#include <io_ports.h>
52
53#ifdef CONFIG_X86_LOCAL_APIC
54#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
55#else
56#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
57#endif
58
59/* Cached VMI operations */
60struct vmi_timer_ops vmi_timer_ops;
61
62#ifdef CONFIG_NO_IDLE_HZ
63
64/* /proc/sys/kernel/hz_timer state. */
65int sysctl_hz_timer;
66
67/* Some stats */
68static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
69static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
70static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
71
72#endif /* CONFIG_NO_IDLE_HZ */
73
74/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
75static int alarm_hz = CONFIG_VMI_ALARM_HZ;
76
77/* Cache of the value get_cycle_frequency / HZ. */
78static signed long long cycles_per_jiffy;
79
80/* Cache of the value get_cycle_frequency / alarm_hz. */
81static signed long long cycles_per_alarm;
82
83/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
84 * Protected by xtime_lock. */
85static unsigned long long real_cycles_accounted_system;
86
87/* The number of cycles accounted for by update_process_times(), per cpu. */
88static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
89
90/* The number of stolen cycles accounted, per cpu. */
91static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
92
93/* Clock source. */
94static cycle_t read_real_cycles(void)
95{
96 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
97}
98
99static cycle_t read_available_cycles(void)
100{
101 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
102}
103
104#if 0
105static cycle_t read_stolen_cycles(void)
106{
107 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
108}
109#endif /* 0 */
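
Editorial sketch (inferred from the counter names, not part of the patch): the three VMI cycle counters satisfy real = available + stolen, which is why the accounting code below can derive stolen time without reading VMI_CYCLES_STOLEN directly:

	/* Stolen cycles: wall-clock cycles the hypervisor did not
	 * make available to this VCPU. */
	static inline unsigned long long vmi_stolen_cycles(void)
	{
		return read_real_cycles() - read_available_cycles();
	}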
110
111static struct clocksource clocksource_vmi = {
112 .name = "vmi-timer",
113 .rating = 450,
114 .read = read_real_cycles,
115 .mask = CLOCKSOURCE_MASK(64),
116 .mult = 0, /* to be set */
117 .shift = 22,
118 .is_continuous = 1,
119};
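
For reference, the generic timekeeping code scales this clocksource as ns = (cycles * mult) >> shift; a sketch of the arithmetic (hypothetical helper, not part of the patch):

	static inline u64 vmi_cycles_to_ns(cycle_t cycles, u32 mult, u32 shift)
	{
		return ((u64)cycles * mult) >> shift;
	}

The mult value is filled in by vmi_time_init() below via clocksource_khz2mult() once the cycle frequency is known.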
120
121
122/* Timer interrupt handler. */
123static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
124
125static struct irqaction vmi_timer_irq = {
126 vmi_timer_interrupt,
127 SA_INTERRUPT,
128 CPU_MASK_NONE,
129 "VMI-alarm",
130 NULL,
131 NULL
132};
133
134/* Alarm rate */
135static int __init vmi_timer_alarm_rate_setup(char* str)
136{
137 int alarm_rate;
138 if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
139 alarm_hz = alarm_rate;
140 printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
141 }
142 return 1;
143}
144__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
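
Usage note (illustrative): passing vmi_timer_alarm_hz=512 on the kernel command line raises the alarm rate from the default CONFIG_VMI_ALARM_HZ to 512 alarms per second.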
145
146
147/* Initialization */
148static void vmi_get_wallclock_ts(struct timespec *ts)
149{
150 unsigned long long wallclock;
151	wallclock = vmi_timer_ops.get_wallclock(); /* nsec units */
152 ts->tv_nsec = do_div(wallclock, 1000000000);
153 ts->tv_sec = wallclock;
154}
155
156static void update_xtime_from_wallclock(void)
157{
158 struct timespec ts;
159 vmi_get_wallclock_ts(&ts);
160 do_settimeofday(&ts);
161}
162
163unsigned long vmi_get_wallclock(void)
164{
165 struct timespec ts;
166 vmi_get_wallclock_ts(&ts);
167 return ts.tv_sec;
168}
169
170int vmi_set_wallclock(unsigned long now)
171{
172 return -1;
173}
174
175unsigned long long vmi_sched_clock(void)
176{
177 return read_available_cycles();
178}
179
180void __init vmi_time_init(void)
181{
182 unsigned long long cycles_per_sec, cycles_per_msec;
183 unsigned long flags;
184
185 local_irq_save(flags);
186 setup_irq(0, &vmi_timer_irq);
187#ifdef CONFIG_X86_LOCAL_APIC
188 set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
189#endif
190
191 no_sync_cmos_clock = 1;
192
193 vmi_get_wallclock_ts(&xtime);
194 set_normalized_timespec(&wall_to_monotonic,
195 -xtime.tv_sec, -xtime.tv_nsec);
196
197 real_cycles_accounted_system = read_real_cycles();
198 update_xtime_from_wallclock();
199 per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
200
201 cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
202
203 cycles_per_jiffy = cycles_per_sec;
204 (void)do_div(cycles_per_jiffy, HZ);
205 cycles_per_alarm = cycles_per_sec;
206 (void)do_div(cycles_per_alarm, alarm_hz);
207 cycles_per_msec = cycles_per_sec;
208 (void)do_div(cycles_per_msec, 1000);
209 cpu_khz = cycles_per_msec;
210
211	printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ; "
212 "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
213 cycles_per_alarm);
214
215 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
216 clocksource_vmi.shift);
217 if (clocksource_register(&clocksource_vmi))
218		printk(KERN_WARNING "Error registering VMITIME clocksource.\n");
219
220 /* Disable PIT. */
221 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
222
223	/* Schedule the alarm. Do this in phase with process_times_cycles_accounted_cpu
224	 * to reduce the latency of calling update_process_times. */
225 vmi_timer_ops.set_alarm(
226 VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
227 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
228 cycles_per_alarm);
229
230 local_irq_restore(flags);
231}
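
A worked example of the do_div() convention used above (illustrative numbers): do_div() divides its 64-bit first argument in place by a 32-bit divisor and returns the remainder, which is why the quotients are read back from the variables and the remainders discarded with the (void) casts:

	/* Sketch: a 2.4 GHz cycle counter with HZ == 250. */
	unsigned long long cycles = 2400000000ULL;
	unsigned long rem = do_div(cycles, 250);	/* cycles == 9600000, rem == 0 */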
232
233#ifdef CONFIG_X86_LOCAL_APIC
234
235void __init vmi_timer_setup_boot_alarm(void)
236{
237 local_irq_disable();
238
239 /* Route the interrupt to the correct vector. */
240 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
241
242 /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
243 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
244 vmi_timer_ops.set_alarm(
245 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
246 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
247 cycles_per_alarm);
248 local_irq_enable();
249}
250
251/* Initialize the time accounting variables for an AP on an SMP system.
252 * Also, set the local alarm for the AP. */
253void __init vmi_timer_setup_secondary_alarm(void)
254{
255 int cpu = smp_processor_id();
256
257 /* Route the interrupt to the correct vector. */
258 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
259
260 per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
261
262 vmi_timer_ops.set_alarm(
263 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
264 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
265 cycles_per_alarm);
266}
267
268#endif
269
270/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
271static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
272{
273 long long cycles_not_accounted;
274
275 write_seqlock(&xtime_lock);
276
277 cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
278 while (cycles_not_accounted >= cycles_per_jiffy) {
279		/* system-wide jiffies and wallclock. */
280 do_timer(1);
281
282 cycles_not_accounted -= cycles_per_jiffy;
283 real_cycles_accounted_system += cycles_per_jiffy;
284 }
285
286 if (vmi_timer_ops.wallclock_updated())
287 update_xtime_from_wallclock();
288
289 write_sequnlock(&xtime_lock);
290}
291
292/* Update per-cpu process times. */
293static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
294 unsigned long long cur_process_times_cycles)
295{
296 long long cycles_not_accounted;
297 cycles_not_accounted = cur_process_times_cycles -
298 per_cpu(process_times_cycles_accounted_cpu, cpu);
299
300 while (cycles_not_accounted >= cycles_per_jiffy) {
301 /* Account time to the current process. This includes
302 * calling into the scheduler to decrement the timeslice
303 * and possibly reschedule.*/
304 update_process_times(user_mode(regs));
305 /* XXX handle /proc/profile multiplier. */
306 profile_tick(CPU_PROFILING);
307
308 cycles_not_accounted -= cycles_per_jiffy;
309 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
310 }
311}
312
313#ifdef CONFIG_NO_IDLE_HZ
314/* Update per-cpu idle times. Used when a no-hz halt is ended. */
315static void vmi_account_no_hz_idle_cycles(int cpu,
316 unsigned long long cur_process_times_cycles)
317{
318 long long cycles_not_accounted;
319 unsigned long no_idle_hz_jiffies = 0;
320
321 cycles_not_accounted = cur_process_times_cycles -
322 per_cpu(process_times_cycles_accounted_cpu, cpu);
323
324 while (cycles_not_accounted >= cycles_per_jiffy) {
325 no_idle_hz_jiffies++;
326 cycles_not_accounted -= cycles_per_jiffy;
327 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
328 }
329 /* Account time to the idle process. */
330 account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
331}
332#endif
333
334/* Update per-cpu stolen time. */
335static void vmi_account_stolen_cycles(int cpu,
336 unsigned long long cur_real_cycles,
337 unsigned long long cur_avail_cycles)
338{
339 long long stolen_cycles_not_accounted;
340 unsigned long stolen_jiffies = 0;
341
342 if (cur_real_cycles < cur_avail_cycles)
343 return;
344
345 stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
346 per_cpu(stolen_cycles_accounted_cpu, cpu);
347
348 while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
349 stolen_jiffies++;
350 stolen_cycles_not_accounted -= cycles_per_jiffy;
351 per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
352 }
353 /* HACK: pass NULL to force time onto cpustat->steal. */
354 account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
355}
356
357/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
358 * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
359static void vmi_local_timer_interrupt(int cpu)
360{
361 unsigned long long cur_real_cycles, cur_process_times_cycles;
362
363 cur_real_cycles = read_real_cycles();
364 cur_process_times_cycles = read_available_cycles();
365 /* Update system wide (real) time state (xtime, jiffies). */
366 vmi_account_real_cycles(cur_real_cycles);
367 /* Update per-cpu process times. */
368 vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
369 /* Update time stolen from this cpu by the hypervisor. */
370 vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
371}
372
373#ifdef CONFIG_NO_IDLE_HZ
374
375/* Must be called only from idle loop, with interrupts disabled. */
376int vmi_stop_hz_timer(void)
377{
378 /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
379
380 unsigned long seq, next;
381 unsigned long long real_cycles_expiry;
382 int cpu = smp_processor_id();
383 int idle;
384
385 BUG_ON(!irqs_disabled());
386 if (sysctl_hz_timer != 0)
387 return 0;
388
389 cpu_set(cpu, nohz_cpu_mask);
390 smp_mb();
391 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
392 (next = next_timer_interrupt(), time_before_eq(next, jiffies))) {
393 cpu_clear(cpu, nohz_cpu_mask);
394 next = jiffies;
395 idle = 0;
396 } else
397 idle = 1;
398
399 /* Convert jiffies to the real cycle counter. */
400 do {
401 seq = read_seqbegin(&xtime_lock);
402 real_cycles_expiry = real_cycles_accounted_system +
403 (long)(next - jiffies) * cycles_per_jiffy;
404 } while (read_seqretry(&xtime_lock, seq));
405
406 /* This cpu is going idle. Disable the periodic alarm. */
407 if (idle) {
408 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
409 per_cpu(idle_start_jiffies, cpu) = jiffies;
410 }
411
412 /* Set the real time alarm to expire at the next event. */
413 vmi_timer_ops.set_alarm(
414 VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
415 real_cycles_expiry, 0);
416
417 return idle;
418}
419
420static void vmi_reenable_hz_timer(int cpu)
421{
422 /* For /proc/vmi/info idle_hz stat. */
423 per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
424 per_cpu(vmi_idle_no_hz_irqs, cpu)++;
425
426 /* Don't bother explicitly cancelling the one-shot alarm -- at
427	 * worst we will receive a spurious timer interrupt. */
428 vmi_timer_ops.set_alarm(
429 VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
430 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
431 cycles_per_alarm);
432 /* Indicate this cpu is no longer nohz idle. */
433 cpu_clear(cpu, nohz_cpu_mask);
434}
435
436/* Called from interrupt handlers when (local) HZ timer is disabled. */
437void vmi_account_time_restart_hz_timer(void)
438{
439 unsigned long long cur_real_cycles, cur_process_times_cycles;
440 int cpu = smp_processor_id();
441
442 BUG_ON(!irqs_disabled());
443 /* Account the time during which the HZ timer was disabled. */
444 cur_real_cycles = read_real_cycles();
445 cur_process_times_cycles = read_available_cycles();
446 /* Update system wide (real) time state (xtime, jiffies). */
447 vmi_account_real_cycles(cur_real_cycles);
448 /* Update per-cpu idle times. */
449 vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
450 /* Update time stolen from this cpu by the hypervisor. */
451 vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
452 /* Reenable the hz timer. */
453 vmi_reenable_hz_timer(cpu);
454}
455
456#endif /* CONFIG_NO_IDLE_HZ */
457
458/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
459 * Handler for IRQ0. Not used on SMP or X86_LOCAL_APIC systems once
460 * APIC setup is done and vmi_timer_setup_boot_alarm() has been called. */
461static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
462{
463 vmi_local_timer_interrupt(smp_processor_id());
464 return IRQ_HANDLED;
465}
466
467#ifdef CONFIG_X86_LOCAL_APIC
468
469/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
470 * Also used in UP when CONFIG_X86_LOCAL_APIC.
471 * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
472void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
473{
474 struct pt_regs *old_regs = set_irq_regs(regs);
475 int cpu = smp_processor_id();
476
477 /*
478 * the NMI deadlock-detector uses this.
479 */
480 per_cpu(irq_stat,cpu).apic_timer_irqs++;
481
482 /*
483 * NOTE! We'd better ACK the irq immediately,
484 * because timer handling can be slow.
485 */
486 ack_APIC_irq();
487
488 /*
489 * update_process_times() expects us to have done irq_enter().
490	 * Besides, if we don't, timer interrupts ignore the global
491 * interrupt lock, which is the WrongThing (tm) to do.
492 */
493 irq_enter();
494 vmi_local_timer_interrupt(cpu);
495 irq_exit();
496 set_irq_regs(old_regs);
497}
498
499#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 5038a73d554e..ca51610955df 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -37,9 +37,14 @@ SECTIONS
37{ 37{
38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; 38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
39 phys_startup_32 = startup_32 - LOAD_OFFSET; 39 phys_startup_32 = startup_32 - LOAD_OFFSET;
40
41 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
43 *(.text.head)
44 } :text = 0x9090
45
40 /* read-only */ 46 /* read-only */
41 .text : AT(ADDR(.text) - LOAD_OFFSET) { 47 .text : AT(ADDR(.text) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
43 *(.text) 48 *(.text)
44 SCHED_TEXT 49 SCHED_TEXT
45 LOCK_TEXT 50 LOCK_TEXT
diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c
index 9819b705efa4..2e2c51a8bd3a 100644
--- a/arch/i386/math-emu/get_address.c
+++ b/arch/i386/math-emu/get_address.c
@@ -56,15 +56,14 @@ static int reg_offset_vm86[] = {
56#define VM86_REG_(x) (*(unsigned short *) \ 56#define VM86_REG_(x) (*(unsigned short *) \
57 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) 57 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
58 58
59/* These are dummy, fs and gs are not saved on the stack. */ 59/* This dummy, gs is not saved on the stack. */
60#define ___FS ___ds
61#define ___GS ___ds 60#define ___GS ___ds
62 61
63static int reg_offset_pm[] = { 62static int reg_offset_pm[] = {
64 offsetof(struct info,___cs), 63 offsetof(struct info,___cs),
65 offsetof(struct info,___ds), 64 offsetof(struct info,___ds),
66 offsetof(struct info,___es), 65 offsetof(struct info,___es),
67 offsetof(struct info,___FS), 66 offsetof(struct info,___fs),
68 offsetof(struct info,___GS), 67 offsetof(struct info,___GS),
69 offsetof(struct info,___ss), 68 offsetof(struct info,___ss),
70 offsetof(struct info,___ds) 69 offsetof(struct info,___ds)
@@ -169,13 +168,10 @@ static long pm_address(u_char FPU_modrm, u_char segment,
169 168
170 switch ( segment ) 169 switch ( segment )
171 { 170 {
172 /* fs and gs aren't used by the kernel, so they still have their 171 /* gs isn't used by the kernel, so it still has its
173 user-space values. */ 172 user-space value. */
174 case PREFIX_FS_-1:
175 /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
176 savesegment(fs, addr->selector);
177 break;
178 case PREFIX_GS_-1: 173 case PREFIX_GS_-1:
174 /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
179 savesegment(gs, addr->selector); 175 savesegment(gs, addr->selector);
180 break; 176 break;
181 default: 177 default:
diff --git a/arch/i386/math-emu/status_w.h b/arch/i386/math-emu/status_w.h
index 78d7b7689dd6..59e73302aa60 100644
--- a/arch/i386/math-emu/status_w.h
+++ b/arch/i386/math-emu/status_w.h
@@ -48,9 +48,11 @@
48 48
49#define status_word() \ 49#define status_word() \
50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) 50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
51#define setcc(cc) ({ \ 51static inline void setcc(int cc)
52 partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); \ 52{
53 partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); }) 53 partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3);
54 partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3);
55}
54 56
55#ifdef PECULIAR_486 57#ifdef PECULIAR_486
56 /* Default, this conveys no information, but an 80486 does it. */ 58 /* Default, this conveys no information, but an 80486 does it. */
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index e0c390d6ceb5..aa58720f6871 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -101,7 +101,6 @@ extern void find_max_pfn(void);
101extern void add_one_highpage_init(struct page *, int, int); 101extern void add_one_highpage_init(struct page *, int, int);
102 102
103extern struct e820map e820; 103extern struct e820map e820;
104extern unsigned long init_pg_tables_end;
105extern unsigned long highend_pfn, highstart_pfn; 104extern unsigned long highend_pfn, highstart_pfn;
106extern unsigned long max_low_pfn; 105extern unsigned long max_low_pfn;
107extern unsigned long totalram_pages; 106extern unsigned long totalram_pages;
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index cba9b3894a33..b8c4e259fc8b 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -46,17 +46,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb)
46} 46}
47EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); 47EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
48 48
49static inline int notify_page_fault(enum die_val val, const char *str, 49static inline int notify_page_fault(struct pt_regs *regs, long err)
50 struct pt_regs *regs, long err, int trap, int sig)
51{ 50{
52 struct die_args args = { 51 struct die_args args = {
53 .regs = regs, 52 .regs = regs,
54 .str = str, 53 .str = "page fault",
55 .err = err, 54 .err = err,
56 .trapnr = trap, 55 .trapnr = 14,
57 .signr = sig 56 .signr = SIGSEGV
58 }; 57 };
59 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); 58 return atomic_notifier_call_chain(&notify_page_fault_chain,
59 DIE_PAGE_FAULT, &args);
60} 60}
61 61
62/* 62/*
@@ -327,8 +327,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
327 if (unlikely(address >= TASK_SIZE)) { 327 if (unlikely(address >= TASK_SIZE)) {
328 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) 328 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
329 return; 329 return;
330 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, 330 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
331 SIGSEGV) == NOTIFY_STOP)
332 return; 331 return;
333 /* 332 /*
334 * Don't take the mm semaphore here. If we fixup a prefetch 333 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -337,8 +336,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
337 goto bad_area_nosemaphore; 336 goto bad_area_nosemaphore;
338 } 337 }
339 338
340 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, 339 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
341 SIGSEGV) == NOTIFY_STOP)
342 return; 340 return;
343 341
344 /* It's safe to allow irq's after cr2 has been saved and the vmalloc 342 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index c5c5ea700cc7..ae436882af7a 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -62,6 +62,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
62 62
63#ifdef CONFIG_X86_PAE 63#ifdef CONFIG_X86_PAE
64 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 64 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
65 paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
65 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 66 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
66 pud = pud_offset(pgd, 0); 67 pud = pud_offset(pgd, 0);
67 if (pmd_table != pmd_offset(pud, 0)) 68 if (pmd_table != pmd_offset(pud, 0))
@@ -82,6 +83,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
82{ 83{
83 if (pmd_none(*pmd)) { 84 if (pmd_none(*pmd)) {
84 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); 85 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
86 paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
85 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 87 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
86 if (page_table != pte_offset_kernel(pmd, 0)) 88 if (page_table != pte_offset_kernel(pmd, 0))
87 BUG(); 89 BUG();
@@ -345,6 +347,8 @@ static void __init pagetable_init (void)
345 /* Init entries of the first-level page table to the zero page */ 347 /* Init entries of the first-level page table to the zero page */
346 for (i = 0; i < PTRS_PER_PGD; i++) 348 for (i = 0; i < PTRS_PER_PGD; i++)
347 set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); 349 set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
350#else
351 paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
348#endif 352#endif
349 353
350 /* Enable PSE if available */ 354 /* Enable PSE if available */
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index e223b1d4981c..412ebbd8adb0 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,6 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
60 address = __pa(address); 60 address = __pa(address);
61 addr = address & LARGE_PAGE_MASK; 61 addr = address & LARGE_PAGE_MASK;
62 pbase = (pte_t *)page_address(base); 62 pbase = (pte_t *)page_address(base);
63 paravirt_alloc_pt(page_to_pfn(base));
63 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { 64 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
64 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, 65 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
65 addr == address ? prot : ref_prot)); 66 addr == address ? prot : ref_prot));
@@ -172,6 +173,7 @@ __change_page_attr(struct page *page, pgprot_t prot)
172 if (!PageReserved(kpte_page)) { 173 if (!PageReserved(kpte_page)) {
173 if (cpu_has_pse && (page_private(kpte_page) == 0)) { 174 if (cpu_has_pse && (page_private(kpte_page) == 0)) {
174 ClearPagePrivate(kpte_page); 175 ClearPagePrivate(kpte_page);
176 paravirt_release_pt(page_to_pfn(kpte_page));
175 list_add(&kpte_page->lru, &df_list); 177 list_add(&kpte_page->lru, &df_list);
176 revert_page(kpte_page, address); 178 revert_page(kpte_page, address);
177 } 179 }
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index f349eaf450b0..fa0cfbd551e1 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -171,6 +171,8 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
171void reserve_top_address(unsigned long reserve) 171void reserve_top_address(unsigned long reserve)
172{ 172{
173 BUG_ON(fixmaps > 0); 173 BUG_ON(fixmaps > 0);
174 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
175 (int)-reserve);
174#ifdef CONFIG_COMPAT_VDSO 176#ifdef CONFIG_COMPAT_VDSO
175 BUG_ON(reserve != 0); 177 BUG_ON(reserve != 0);
176#else 178#else
@@ -248,9 +250,15 @@ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
248 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, 250 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
249 swapper_pg_dir + USER_PTRS_PER_PGD, 251 swapper_pg_dir + USER_PTRS_PER_PGD,
250 KERNEL_PGD_PTRS); 252 KERNEL_PGD_PTRS);
253
251 if (PTRS_PER_PMD > 1) 254 if (PTRS_PER_PMD > 1)
252 return; 255 return;
253 256
257 /* must happen under lock */
258 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
259 __pa(swapper_pg_dir) >> PAGE_SHIFT,
260 USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
261
254 pgd_list_add(pgd); 262 pgd_list_add(pgd);
255 spin_unlock_irqrestore(&pgd_lock, flags); 263 spin_unlock_irqrestore(&pgd_lock, flags);
256} 264}
@@ -260,6 +268,7 @@ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
260{ 268{
261 unsigned long flags; /* can be called from interrupt context */ 269 unsigned long flags; /* can be called from interrupt context */
262 270
271 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
263 spin_lock_irqsave(&pgd_lock, flags); 272 spin_lock_irqsave(&pgd_lock, flags);
264 pgd_list_del(pgd); 273 pgd_list_del(pgd);
265 spin_unlock_irqrestore(&pgd_lock, flags); 274 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -277,13 +286,18 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
277 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); 286 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
278 if (!pmd) 287 if (!pmd)
279 goto out_oom; 288 goto out_oom;
289 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
280 set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); 290 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
281 } 291 }
282 return pgd; 292 return pgd;
283 293
284out_oom: 294out_oom:
285 for (i--; i >= 0; i--) 295 for (i--; i >= 0; i--) {
286 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); 296 pgd_t pgdent = pgd[i];
297 void* pmd = (void *)__va(pgd_val(pgdent)-1);
298 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
299 kmem_cache_free(pmd_cache, pmd);
300 }
287 kmem_cache_free(pgd_cache, pgd); 301 kmem_cache_free(pgd_cache, pgd);
288 return NULL; 302 return NULL;
289} 303}
@@ -294,8 +308,12 @@ void pgd_free(pgd_t *pgd)
294 308
295 /* in the PAE case user pgd entries are overwritten before usage */ 309 /* in the PAE case user pgd entries are overwritten before usage */
296 if (PTRS_PER_PMD > 1) 310 if (PTRS_PER_PMD > 1)
297 for (i = 0; i < USER_PTRS_PER_PGD; ++i) 311 for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
298 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); 312 pgd_t pgdent = pgd[i];
313 void* pmd = (void *)__va(pgd_val(pgdent)-1);
314 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
315 kmem_cache_free(pmd_cache, pmd);
316 }
299 /* in the non-PAE case, free_pgtables() clears user pgd entries */ 317 /* in the non-PAE case, free_pgtables() clears user pgd entries */
300 kmem_cache_free(pgd_cache, pgd); 318 kmem_cache_free(pgd_cache, pgd);
301} 319}
diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c
index ca2447e05e15..c554f52cb808 100644
--- a/arch/i386/oprofile/op_model_ppro.c
+++ b/arch/i386/oprofile/op_model_ppro.c
@@ -24,7 +24,8 @@
24 24
25#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) 25#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
26#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) 26#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
27#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0) 27#define CTR_32BIT_WRITE(l,msrs,c) \
28 do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0)
28#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) 29#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
29 30
30#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) 31#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
@@ -79,7 +80,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
79 for (i = 0; i < NUM_COUNTERS; ++i) { 80 for (i = 0; i < NUM_COUNTERS; ++i) {
80 if (unlikely(!CTR_IS_RESERVED(msrs,i))) 81 if (unlikely(!CTR_IS_RESERVED(msrs,i)))
81 continue; 82 continue;
82 CTR_WRITE(1, msrs, i); 83 CTR_32BIT_WRITE(1, msrs, i);
83 } 84 }
84 85
85 /* enable active counters */ 86 /* enable active counters */
@@ -87,7 +88,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
87 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) { 88 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
88 reset_value[i] = counter_config[i].count; 89 reset_value[i] = counter_config[i].count;
89 90
90 CTR_WRITE(counter_config[i].count, msrs, i); 91 CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
91 92
92 CTRL_READ(low, high, msrs, i); 93 CTRL_READ(low, high, msrs, i);
93 CTRL_CLEAR(low); 94 CTRL_CLEAR(low);
@@ -116,7 +117,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
116 CTR_READ(low, high, msrs, i); 117 CTR_READ(low, high, msrs, i);
117 if (CTR_OVERFLOWED(low)) { 118 if (CTR_OVERFLOWED(low)) {
118 oprofile_add_sample(regs, i); 119 oprofile_add_sample(regs, i);
119 CTR_WRITE(reset_value[i], msrs, i); 120 CTR_32BIT_WRITE(reset_value[i], msrs, i);
120 } 121 }
121 } 122 }
122 123
diff --git a/arch/i386/pci/Makefile b/arch/i386/pci/Makefile
index 1594d2f55c8f..44650e03308b 100644
--- a/arch/i386/pci/Makefile
+++ b/arch/i386/pci/Makefile
@@ -1,7 +1,7 @@
1obj-y := i386.o init.o 1obj-y := i386.o init.o
2 2
3obj-$(CONFIG_PCI_BIOS) += pcbios.o 3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o 4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6 6
7pci-y := fixup.o 7pci-y := fixup.o
diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c
new file mode 100644
index 000000000000..747d8c63b0c4
--- /dev/null
+++ b/arch/i386/pci/mmconfig-shared.c
@@ -0,0 +1,264 @@
1/*
2 * mmconfig-shared.c - Low-level direct PCI config space access via
3 * MMCONFIG - common code between i386 and x86-64.
4 *
5 * This code does:
6 * - known chipset handling
7 * - ACPI decoding and validation
8 *
9 * Per-architecture code takes care of the mappings and accesses
10 * themselves.
11 */
12
13#include <linux/pci.h>
14#include <linux/init.h>
15#include <linux/acpi.h>
16#include <linux/bitmap.h>
17#include <asm/e820.h>
18
19#include "pci.h"
20
21/* aperture is up to 256MB but BIOS may reserve less */
22#define MMCONFIG_APER_MIN (2 * 1024*1024)
23#define MMCONFIG_APER_MAX (256 * 1024*1024)
24
25DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
26
27/* K8 systems have some devices (typically in the built-in northbridge)
28   that are only accessible using type 1 config cycles.
29   Normally this can be expressed in the MCFG by not listing them
30   and assigning suitable _SEGs, but this isn't implemented in some BIOSes.
31   Instead, try to discover all devices on the first PCI_MMCFG_MAX_CHECK_BUS
32   buses that are unreachable using MMCONFIG and fall back to type 1 for them. */
33static void __init unreachable_devices(void)
34{
35 int i, bus;
36 /* Use the max bus number from ACPI here? */
37 for (bus = 0; bus < PCI_MMCFG_MAX_CHECK_BUS; bus++) {
38 for (i = 0; i < 32; i++) {
39 unsigned int devfn = PCI_DEVFN(i, 0);
40 u32 val1, val2;
41
42 pci_conf1_read(0, bus, devfn, 0, 4, &val1);
43 if (val1 == 0xffffffff)
44 continue;
45
46 if (pci_mmcfg_arch_reachable(0, bus, devfn)) {
47 raw_pci_ops->read(0, bus, devfn, 0, 4, &val2);
48 if (val1 == val2)
49 continue;
50 }
51 set_bit(i + 32 * bus, pci_mmcfg_fallback_slots);
52 printk(KERN_NOTICE "PCI: No mmconfig possible on device"
53 " %02x:%02x\n", bus, i);
54 }
55 }
56}
57
58static const char __init *pci_mmcfg_e7520(void)
59{
60 u32 win;
61 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win);
62
63 pci_mmcfg_config_num = 1;
64 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
65 if (!pci_mmcfg_config)
66 return NULL;
67 pci_mmcfg_config[0].address = (win & 0xf000) << 16;
68 pci_mmcfg_config[0].pci_segment = 0;
69 pci_mmcfg_config[0].start_bus_number = 0;
70 pci_mmcfg_config[0].end_bus_number = 255;
71
72 return "Intel Corporation E7520 Memory Controller Hub";
73}
74
75static const char __init *pci_mmcfg_intel_945(void)
76{
77 u32 pciexbar, mask = 0, len = 0;
78
79 pci_mmcfg_config_num = 1;
80
81 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar);
82
83 /* Enable bit */
84 if (!(pciexbar & 1))
85 pci_mmcfg_config_num = 0;
86
87 /* Size bits */
88 switch ((pciexbar >> 1) & 3) {
89 case 0:
90 mask = 0xf0000000U;
91 len = 0x10000000U;
92 break;
93 case 1:
94 mask = 0xf8000000U;
95 len = 0x08000000U;
96 break;
97 case 2:
98 mask = 0xfc000000U;
99 len = 0x04000000U;
100 break;
101 default:
102 pci_mmcfg_config_num = 0;
103 }
104
105	/* Errata #2, things break when not aligned on a 256MB boundary */
106 /* Can only happen in 64M/128M mode */
107
108 if ((pciexbar & mask) & 0x0fffffffU)
109 pci_mmcfg_config_num = 0;
110
111 if (pci_mmcfg_config_num) {
112 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
113 if (!pci_mmcfg_config)
114 return NULL;
115 pci_mmcfg_config[0].address = pciexbar & mask;
116 pci_mmcfg_config[0].pci_segment = 0;
117 pci_mmcfg_config[0].start_bus_number = 0;
118 pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
119 }
120
121 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
122}
123
124struct pci_mmcfg_hostbridge_probe {
125 u32 vendor;
126 u32 device;
127 const char *(*probe)(void);
128};
129
130static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
131 { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
132 { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 },
133};
134
135static int __init pci_mmcfg_check_hostbridge(void)
136{
137 u32 l;
138 u16 vendor, device;
139 int i;
140 const char *name;
141
142 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l);
143 vendor = l & 0xffff;
144 device = (l >> 16) & 0xffff;
145
146 pci_mmcfg_config_num = 0;
147 pci_mmcfg_config = NULL;
148 name = NULL;
149
150 for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
151 if (pci_mmcfg_probes[i].vendor == vendor &&
152 pci_mmcfg_probes[i].device == device)
153 name = pci_mmcfg_probes[i].probe();
154 }
155
156 if (name) {
157 printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n",
158 name, pci_mmcfg_config_num ? "with" : "without");
159 }
160
161 return name != NULL;
162}
163
164static void __init pci_mmcfg_insert_resources(void)
165{
166#define PCI_MMCFG_RESOURCE_NAME_LEN 19
167 int i;
168 struct resource *res;
169 char *names;
170 unsigned num_buses;
171
172 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
173 pci_mmcfg_config_num, GFP_KERNEL);
174 if (!res) {
175 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
176 return;
177 }
178
179 names = (void *)&res[pci_mmcfg_config_num];
180 for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
181 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
182 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
183 res->name = names;
184 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
185 cfg->pci_segment);
186 res->start = cfg->address;
187 res->end = res->start + (num_buses << 20) - 1;
188 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
189 insert_resource(&iomem_resource, res);
190 names += PCI_MMCFG_RESOURCE_NAME_LEN;
191 }
192}
193
194static void __init pci_mmcfg_reject_broken(int type)
195{
196 typeof(pci_mmcfg_config[0]) *cfg;
197
198 if ((pci_mmcfg_config_num == 0) ||
199 (pci_mmcfg_config == NULL) ||
200 (pci_mmcfg_config[0].address == 0))
201 return;
202
203 cfg = &pci_mmcfg_config[0];
204
205 /*
206 * Handle more broken MCFG tables on Asus etc.
207 * They only contain a single entry for bus 0-0.
208 */
209 if (pci_mmcfg_config_num == 1 &&
210 cfg->pci_segment == 0 &&
211 (cfg->start_bus_number | cfg->end_bus_number) == 0) {
212 printk(KERN_ERR "PCI: start and end of bus number is 0. "
213 "Rejected as broken MCFG.\n");
214 goto reject;
215 }
216
217 /*
218 * Only do this check when type 1 works. If it doesn't work
219 * assume we run on a Mac and always use MCFG
220 */
221 if (type == 1 && !e820_all_mapped(cfg->address,
222 cfg->address + MMCONFIG_APER_MIN,
223 E820_RESERVED)) {
224 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
225 " E820-reserved\n", cfg->address);
226 goto reject;
227 }
228 return;
229
230reject:
231 printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
232 kfree(pci_mmcfg_config);
233 pci_mmcfg_config = NULL;
234 pci_mmcfg_config_num = 0;
235}
236
237void __init pci_mmcfg_init(int type)
238{
239 int known_bridge = 0;
240
241 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
242 return;
243
244 if (type == 1 && pci_mmcfg_check_hostbridge())
245 known_bridge = 1;
246
247 if (!known_bridge) {
248 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
249 pci_mmcfg_reject_broken(type);
250 }
251
252 if ((pci_mmcfg_config_num == 0) ||
253 (pci_mmcfg_config == NULL) ||
254 (pci_mmcfg_config[0].address == 0))
255 return;
256
257 if (pci_mmcfg_arch_init()) {
258 if (type == 1)
259 unreachable_devices();
260 if (known_bridge)
261 pci_mmcfg_insert_resources();
262 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
263 }
264}
diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c
index 5700220dcf5f..bb1afd9e589d 100644
--- a/arch/i386/pci/mmconfig.c
+++ b/arch/i386/pci/mmconfig.c
@@ -15,55 +15,33 @@
15#include <asm/e820.h> 15#include <asm/e820.h>
16#include "pci.h" 16#include "pci.h"
17 17
18/* aperture is up to 256MB but BIOS may reserve less */
19#define MMCONFIG_APER_MIN (2 * 1024*1024)
20#define MMCONFIG_APER_MAX (256 * 1024*1024)
21
22/* Assume systems with more busses have correct MCFG */ 18/* Assume systems with more busses have correct MCFG */
23#define MAX_CHECK_BUS 16
24
25#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) 19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
26 20
27/* The base address of the last MMCONFIG device accessed */ 21/* The base address of the last MMCONFIG device accessed */
28static u32 mmcfg_last_accessed_device; 22static u32 mmcfg_last_accessed_device;
29static int mmcfg_last_accessed_cpu; 23static int mmcfg_last_accessed_cpu;
30 24
31static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32);
32
33/* 25/*
34 * Functions for accessing PCI configuration space with MMCONFIG accesses 26 * Functions for accessing PCI configuration space with MMCONFIG accesses
35 */ 27 */
36static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) 28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
37{ 29{
38 int cfg_num = -1;
39 struct acpi_mcfg_allocation *cfg; 30 struct acpi_mcfg_allocation *cfg;
31 int cfg_num;
40 32
41 if (seg == 0 && bus < MAX_CHECK_BUS && 33 if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
42 test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots)) 34 test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots))
43 return 0; 35 return 0;
44 36
45 while (1) { 37 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
46 ++cfg_num;
47 if (cfg_num >= pci_mmcfg_config_num) {
48 break;
49 }
50 cfg = &pci_mmcfg_config[cfg_num]; 38 cfg = &pci_mmcfg_config[cfg_num];
51 if (cfg->pci_segment != seg) 39 if (cfg->pci_segment == seg &&
52 continue; 40 (cfg->start_bus_number <= bus) &&
53 if ((cfg->start_bus_number <= bus) &&
54 (cfg->end_bus_number >= bus)) 41 (cfg->end_bus_number >= bus))
55 return cfg->address; 42 return cfg->address;
56 } 43 }
57 44
58 /* Handle more broken MCFG tables on Asus etc.
59 They only contain a single entry for bus 0-0. Assume
60 this applies to all busses. */
61 cfg = &pci_mmcfg_config[0];
62 if (pci_mmcfg_config_num == 1 &&
63 cfg->pci_segment == 0 &&
64 (cfg->start_bus_number | cfg->end_bus_number) == 0)
65 return cfg->address;
66
67 /* Fall back to type 0 */ 45 /* Fall back to type 0 */
68 return 0; 46 return 0;
69} 47}
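
Editorial sketch (per the PCI Express ECAM layout, not shown in the patch): the base address returned above is indexed per function, with each device function owning a 4KB configuration window:

	static inline u32 mmcfg_offset(unsigned int bus, unsigned int devfn,
				       int reg)
	{
		return (bus << 20) | (devfn << 12) | reg;
	}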
@@ -158,67 +136,15 @@ static struct pci_raw_ops pci_mmcfg = {
158 .write = pci_mmcfg_write, 136 .write = pci_mmcfg_write,
159}; 137};
160 138
161/* K8 systems have some devices (typically in the builtin northbridge) 139int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
162 that are only accessible using type1 140 unsigned int devfn)
163 Normally this can be expressed in the MCFG by not listing them
164 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
165 Instead try to discover all devices on bus 0 that are unreachable using MM
166 and fallback for them. */
167static __init void unreachable_devices(void)
168{ 141{
169 int i, k; 142 return get_base_addr(seg, bus, devfn) != 0;
170 unsigned long flags;
171
172 for (k = 0; k < MAX_CHECK_BUS; k++) {
173 for (i = 0; i < 32; i++) {
174 u32 val1;
175 u32 addr;
176
177 pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1);
178 if (val1 == 0xffffffff)
179 continue;
180
181 /* Locking probably not needed, but safer */
182 spin_lock_irqsave(&pci_config_lock, flags);
183 addr = get_base_addr(0, k, PCI_DEVFN(i, 0));
184 if (addr != 0)
185 pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0));
186 if (addr == 0 ||
187 readl((u32 __iomem *)mmcfg_virt_addr) != val1) {
188 set_bit(i + 32*k, fallback_slots);
189 printk(KERN_NOTICE
190 "PCI: No mmconfig possible on %x:%x\n", k, i);
191 }
192 spin_unlock_irqrestore(&pci_config_lock, flags);
193 }
194 }
195} 143}
196 144
197void __init pci_mmcfg_init(int type) 145int __init pci_mmcfg_arch_init(void)
198{ 146{
199 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
200 return;
201
202 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
203 if ((pci_mmcfg_config_num == 0) ||
204 (pci_mmcfg_config == NULL) ||
205 (pci_mmcfg_config[0].address == 0))
206 return;
207
208 /* Only do this check when type 1 works. If it doesn't work
209 assume we run on a Mac and always use MCFG */
210 if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
211 pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
212 E820_RESERVED)) {
213 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
214 (unsigned long)pci_mmcfg_config[0].address);
215 printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
216 return;
217 }
218
219 printk(KERN_INFO "PCI: Using MMCONFIG\n"); 147 printk(KERN_INFO "PCI: Using MMCONFIG\n");
220 raw_pci_ops = &pci_mmcfg; 148 raw_pci_ops = &pci_mmcfg;
221 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 149 return 1;
222
223 unreachable_devices();
224} 150}
diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h
index a0a25180b61a..e58bae2076ad 100644
--- a/arch/i386/pci/pci.h
+++ b/arch/i386/pci/pci.h
@@ -94,3 +94,13 @@ extern void pci_pcbios_init(void);
94extern void pci_mmcfg_init(int type); 94extern void pci_mmcfg_init(int type);
95extern void pcibios_sort(void); 95extern void pcibios_sort(void);
96 96
97/* pci-mmconfig.c */
98
99/* Verify the first 16 busses. We assume that systems with more busses
100 get MCFG right. */
101#define PCI_MMCFG_MAX_CHECK_BUS 16
102extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
103
104extern int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
105 unsigned int devfn);
106extern int __init pci_mmcfg_arch_init(void);
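
[Editor's note] The fallback bitmap declared above packs one bit per slot, 32 slots per bus, so bus 2 slot 5 lands at bit 5 + 32*2 = 69. A hedged sketch of the indexing, mirroring the test_bit() call in get_base_addr() (the helper is illustrative only):

	/* Mark a slot as unreachable via MMCONFIG; only buses below
	   PCI_MMCFG_MAX_CHECK_BUS are tracked in the bitmap. */
	static void mark_fallback_slot(unsigned int bus, unsigned int devfn)
	{
		if (bus < PCI_MMCFG_MAX_CHECK_BUS)
			set_bit(PCI_SLOT(devfn) + 32 * bus,
				pci_mmcfg_fallback_slots);
	}
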
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 02dd39457bcf..7982cbc3bc94 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -152,18 +152,18 @@ config MPSC
152 Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs 152 Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs
153 with Intel Extended Memory 64 Technology(EM64T). For details see 153 with Intel Extended Memory 64 Technology(EM64T). For details see
154 <http://www.intel.com/technology/64bitextensions/>. 154 <http://www.intel.com/technology/64bitextensions/>.
155 Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the 155 Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
156 Netburst core and shouldn't use this option. You can distingush them 156 Netburst core and shouldn't use this option. You can distinguish them
157 using the cpu family field 157 using the cpu family field
158 in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one 158 in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one
159 (this rule only applies to system that support EM64T) 159 (this rule only applies to systems that support EM64T)
160 160
161config MCORE2 161config MCORE2
162 bool "Intel Core2 / newer Xeon" 162 bool "Intel Core2 / newer Xeon"
163 help 163 help
164 Optimize for Intel Core2 and newer Xeons (51xx) 164 Optimize for Intel Core2 and newer Xeons (51xx)
165 You can distingush the newer Xeons from the older ones using 165 You can distinguish the newer Xeons from the older ones using
166 the cpu family field in /proc/cpuinfo. 15 is a older Xeon 166 the cpu family field in /proc/cpuinfo. 15 is an older Xeon
167 (use CONFIG_MPSC then), 6 is a newer one. This rule only 167 (use CONFIG_MPSC then), 6 is a newer one. This rule only
168 applies to CPUs that support EM64T. 168 applies to CPUs that support EM64T.
169 169
@@ -458,8 +458,8 @@ config IOMMU
458 on systems with more than 3GB. This is usually needed for USB, 458 on systems with more than 3GB. This is usually needed for USB,
459 sound, many IDE/SATA chipsets and some other devices. 459 sound, many IDE/SATA chipsets and some other devices.
460 Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART 460 Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
461 based IOMMU and a software bounce buffer based IOMMU used on Intel 461 based hardware IOMMU and a software bounce buffer based IOMMU used
462 systems and as fallback. 462 on Intel systems and as fallback.
463 The code is only active when needed (enough memory and limited 463 The code is only active when needed (enough memory and limited
464 device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified 464 device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
465 too. 465 too.
@@ -496,6 +496,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
496# need this always selected by IOMMU for the VIA workaround 496# need this always selected by IOMMU for the VIA workaround
497config SWIOTLB 497config SWIOTLB
498 bool 498 bool
499 help
500 Support for software bounce buffers used on x86-64 systems
501 which don't have a hardware IOMMU (e.g. the current generation
 502 of Intel's x86-64 CPUs). Using this, PCI devices that can only
 503 access 32 bits of memory can be used on systems with more than
504 3 GB of memory. If unsure, say Y.
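
[Editor's note] The bounce-buffer idea described in this help text, as a minimal sketch (all helper names here are hypothetical, not the swiotlb API): data bound for a 32-bit-limited device is staged through a copy that lives below 4 GB.

	/* Hypothetical illustration of bounce buffering, not kernel API. */
	void *bounce = alloc_below_4g(len);    /* low buffer - assumed helper */
	memcpy(bounce, high_buf, len);         /* bounce out before the DMA */
	device_dma(dev, virt_to_phys(bounce), len);
	/* for a device-to-memory transfer, copy back after completion:
	   memcpy(high_buf, bounce, len); */
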
499 505
500config X86_MCE 506config X86_MCE
501 bool "Machine check support" if EMBEDDED 507 bool "Machine check support" if EMBEDDED
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 69584c295305..293a4a4c609e 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.20-rc3 3# Linux kernel version: 2.6.20-git8
4# Fri Jan 5 11:54:41 2007 4# Tue Feb 13 11:25:16 2007
5# 5#
6CONFIG_X86_64=y 6CONFIG_X86_64=y
7CONFIG_64BIT=y 7CONFIG_64BIT=y
@@ -11,6 +11,7 @@ CONFIG_LOCKDEP_SUPPORT=y
11CONFIG_STACKTRACE_SUPPORT=y 11CONFIG_STACKTRACE_SUPPORT=y
12CONFIG_SEMAPHORE_SLEEPERS=y 12CONFIG_SEMAPHORE_SLEEPERS=y
13CONFIG_MMU=y 13CONFIG_MMU=y
14CONFIG_ZONE_DMA=y
14CONFIG_RWSEM_GENERIC_SPINLOCK=y 15CONFIG_RWSEM_GENERIC_SPINLOCK=y
15CONFIG_GENERIC_HWEIGHT=y 16CONFIG_GENERIC_HWEIGHT=y
16CONFIG_GENERIC_CALIBRATE_DELAY=y 17CONFIG_GENERIC_CALIBRATE_DELAY=y
@@ -153,6 +154,7 @@ CONFIG_NEED_MULTIPLE_NODES=y
153CONFIG_SPLIT_PTLOCK_CPUS=4 154CONFIG_SPLIT_PTLOCK_CPUS=4
154CONFIG_MIGRATION=y 155CONFIG_MIGRATION=y
155CONFIG_RESOURCES_64BIT=y 156CONFIG_RESOURCES_64BIT=y
157CONFIG_ZONE_DMA_FLAG=1
156CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y 158CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
157CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y 159CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
158CONFIG_NR_CPUS=32 160CONFIG_NR_CPUS=32
@@ -201,13 +203,14 @@ CONFIG_ACPI=y
201CONFIG_ACPI_SLEEP=y 203CONFIG_ACPI_SLEEP=y
202CONFIG_ACPI_SLEEP_PROC_FS=y 204CONFIG_ACPI_SLEEP_PROC_FS=y
203CONFIG_ACPI_SLEEP_PROC_SLEEP=y 205CONFIG_ACPI_SLEEP_PROC_SLEEP=y
206CONFIG_ACPI_PROCFS=y
204CONFIG_ACPI_AC=y 207CONFIG_ACPI_AC=y
205CONFIG_ACPI_BATTERY=y 208CONFIG_ACPI_BATTERY=y
206CONFIG_ACPI_BUTTON=y 209CONFIG_ACPI_BUTTON=y
207# CONFIG_ACPI_VIDEO is not set
208# CONFIG_ACPI_HOTKEY is not set 210# CONFIG_ACPI_HOTKEY is not set
209CONFIG_ACPI_FAN=y 211CONFIG_ACPI_FAN=y
210# CONFIG_ACPI_DOCK is not set 212# CONFIG_ACPI_DOCK is not set
213# CONFIG_ACPI_BAY is not set
211CONFIG_ACPI_PROCESSOR=y 214CONFIG_ACPI_PROCESSOR=y
212CONFIG_ACPI_HOTPLUG_CPU=y 215CONFIG_ACPI_HOTPLUG_CPU=y
213CONFIG_ACPI_THERMAL=y 216CONFIG_ACPI_THERMAL=y
@@ -263,7 +266,6 @@ CONFIG_PCI_MMCONFIG=y
263CONFIG_PCIEPORTBUS=y 266CONFIG_PCIEPORTBUS=y
264CONFIG_PCIEAER=y 267CONFIG_PCIEAER=y
265CONFIG_PCI_MSI=y 268CONFIG_PCI_MSI=y
266# CONFIG_PCI_MULTITHREAD_PROBE is not set
267# CONFIG_PCI_DEBUG is not set 269# CONFIG_PCI_DEBUG is not set
268# CONFIG_HT_IRQ is not set 270# CONFIG_HT_IRQ is not set
269 271
@@ -398,6 +400,7 @@ CONFIG_STANDALONE=y
398CONFIG_PREVENT_FIRMWARE_BUILD=y 400CONFIG_PREVENT_FIRMWARE_BUILD=y
399CONFIG_FW_LOADER=y 401CONFIG_FW_LOADER=y
400# CONFIG_DEBUG_DRIVER is not set 402# CONFIG_DEBUG_DRIVER is not set
403# CONFIG_DEBUG_DEVRES is not set
401# CONFIG_SYS_HYPERVISOR is not set 404# CONFIG_SYS_HYPERVISOR is not set
402 405
403# 406#
@@ -466,6 +469,7 @@ CONFIG_BLK_DEV_IDECD=y
466# CONFIG_BLK_DEV_IDETAPE is not set 469# CONFIG_BLK_DEV_IDETAPE is not set
467# CONFIG_BLK_DEV_IDEFLOPPY is not set 470# CONFIG_BLK_DEV_IDEFLOPPY is not set
468# CONFIG_BLK_DEV_IDESCSI is not set 471# CONFIG_BLK_DEV_IDESCSI is not set
472CONFIG_BLK_DEV_IDEACPI=y
469# CONFIG_IDE_TASK_IOCTL is not set 473# CONFIG_IDE_TASK_IOCTL is not set
470 474
471# 475#
@@ -497,6 +501,7 @@ CONFIG_BLK_DEV_ATIIXP=y
497# CONFIG_BLK_DEV_JMICRON is not set 501# CONFIG_BLK_DEV_JMICRON is not set
498# CONFIG_BLK_DEV_SC1200 is not set 502# CONFIG_BLK_DEV_SC1200 is not set
499CONFIG_BLK_DEV_PIIX=y 503CONFIG_BLK_DEV_PIIX=y
504# CONFIG_BLK_DEV_IT8213 is not set
500# CONFIG_BLK_DEV_IT821X is not set 505# CONFIG_BLK_DEV_IT821X is not set
501# CONFIG_BLK_DEV_NS87415 is not set 506# CONFIG_BLK_DEV_NS87415 is not set
502# CONFIG_BLK_DEV_PDC202XX_OLD is not set 507# CONFIG_BLK_DEV_PDC202XX_OLD is not set
@@ -507,6 +512,7 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y
507# CONFIG_BLK_DEV_SLC90E66 is not set 512# CONFIG_BLK_DEV_SLC90E66 is not set
508# CONFIG_BLK_DEV_TRM290 is not set 513# CONFIG_BLK_DEV_TRM290 is not set
509# CONFIG_BLK_DEV_VIA82CXXX is not set 514# CONFIG_BLK_DEV_VIA82CXXX is not set
515# CONFIG_BLK_DEV_TC86C001 is not set
510# CONFIG_IDE_ARM is not set 516# CONFIG_IDE_ARM is not set
511CONFIG_BLK_DEV_IDEDMA=y 517CONFIG_BLK_DEV_IDEDMA=y
512# CONFIG_IDEDMA_IVB is not set 518# CONFIG_IDEDMA_IVB is not set
@@ -599,6 +605,7 @@ CONFIG_MEGARAID_SAS=y
599# Serial ATA (prod) and Parallel ATA (experimental) drivers 605# Serial ATA (prod) and Parallel ATA (experimental) drivers
600# 606#
601CONFIG_ATA=y 607CONFIG_ATA=y
608# CONFIG_ATA_NONSTANDARD is not set
602CONFIG_SATA_AHCI=y 609CONFIG_SATA_AHCI=y
603CONFIG_SATA_SVW=y 610CONFIG_SATA_SVW=y
604CONFIG_ATA_PIIX=y 611CONFIG_ATA_PIIX=y
@@ -614,6 +621,7 @@ CONFIG_SATA_SIL=y
614# CONFIG_SATA_ULI is not set 621# CONFIG_SATA_ULI is not set
615CONFIG_SATA_VIA=y 622CONFIG_SATA_VIA=y
616# CONFIG_SATA_VITESSE is not set 623# CONFIG_SATA_VITESSE is not set
624# CONFIG_SATA_INIC162X is not set
617CONFIG_SATA_INTEL_COMBINED=y 625CONFIG_SATA_INTEL_COMBINED=y
618# CONFIG_PATA_ALI is not set 626# CONFIG_PATA_ALI is not set
619# CONFIG_PATA_AMD is not set 627# CONFIG_PATA_AMD is not set
@@ -630,6 +638,7 @@ CONFIG_SATA_INTEL_COMBINED=y
630# CONFIG_PATA_HPT3X2N is not set 638# CONFIG_PATA_HPT3X2N is not set
631# CONFIG_PATA_HPT3X3 is not set 639# CONFIG_PATA_HPT3X3 is not set
632# CONFIG_PATA_IT821X is not set 640# CONFIG_PATA_IT821X is not set
641# CONFIG_PATA_IT8213 is not set
633# CONFIG_PATA_JMICRON is not set 642# CONFIG_PATA_JMICRON is not set
634# CONFIG_PATA_TRIFLEX is not set 643# CONFIG_PATA_TRIFLEX is not set
635# CONFIG_PATA_MARVELL is not set 644# CONFIG_PATA_MARVELL is not set
@@ -682,9 +691,7 @@ CONFIG_IEEE1394=y
682# Subsystem Options 691# Subsystem Options
683# 692#
684# CONFIG_IEEE1394_VERBOSEDEBUG is not set 693# CONFIG_IEEE1394_VERBOSEDEBUG is not set
685# CONFIG_IEEE1394_OUI_DB is not set
686# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set 694# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
687# CONFIG_IEEE1394_EXPORT_FULL_API is not set
688 695
689# 696#
690# Device Drivers 697# Device Drivers
@@ -707,6 +714,11 @@ CONFIG_IEEE1394_RAWIO=y
707# CONFIG_I2O is not set 714# CONFIG_I2O is not set
708 715
709# 716#
717# Macintosh device drivers
718#
719# CONFIG_MAC_EMUMOUSEBTN is not set
720
721#
710# Network device support 722# Network device support
711# 723#
712CONFIG_NETDEVICES=y 724CONFIG_NETDEVICES=y
@@ -774,6 +786,7 @@ CONFIG_8139TOO=y
774# CONFIG_EPIC100 is not set 786# CONFIG_EPIC100 is not set
775# CONFIG_SUNDANCE is not set 787# CONFIG_SUNDANCE is not set
776# CONFIG_VIA_RHINE is not set 788# CONFIG_VIA_RHINE is not set
789# CONFIG_SC92031 is not set
777 790
778# 791#
779# Ethernet (1000 Mbit) 792# Ethernet (1000 Mbit)
@@ -795,11 +808,13 @@ CONFIG_E1000=y
795CONFIG_TIGON3=y 808CONFIG_TIGON3=y
796CONFIG_BNX2=y 809CONFIG_BNX2=y
797# CONFIG_QLA3XXX is not set 810# CONFIG_QLA3XXX is not set
811# CONFIG_ATL1 is not set
798 812
799# 813#
800# Ethernet (10000 Mbit) 814# Ethernet (10000 Mbit)
801# 815#
802# CONFIG_CHELSIO_T1 is not set 816# CONFIG_CHELSIO_T1 is not set
817# CONFIG_CHELSIO_T3 is not set
803# CONFIG_IXGB is not set 818# CONFIG_IXGB is not set
804CONFIG_S2IO=m 819CONFIG_S2IO=m
805# CONFIG_S2IO_NAPI is not set 820# CONFIG_S2IO_NAPI is not set
@@ -1115,6 +1130,7 @@ CONFIG_SOUND=y
1115# Open Sound System 1130# Open Sound System
1116# 1131#
1117CONFIG_SOUND_PRIME=y 1132CONFIG_SOUND_PRIME=y
1133CONFIG_OBSOLETE_OSS=y
1118# CONFIG_SOUND_BT878 is not set 1134# CONFIG_SOUND_BT878 is not set
1119# CONFIG_SOUND_ES1371 is not set 1135# CONFIG_SOUND_ES1371 is not set
1120CONFIG_SOUND_ICH=y 1136CONFIG_SOUND_ICH=y
@@ -1128,6 +1144,7 @@ CONFIG_SOUND_ICH=y
1128# HID Devices 1144# HID Devices
1129# 1145#
1130CONFIG_HID=y 1146CONFIG_HID=y
1147# CONFIG_HID_DEBUG is not set
1131 1148
1132# 1149#
1133# USB support 1150# USB support
@@ -1142,10 +1159,8 @@ CONFIG_USB=y
1142# Miscellaneous USB options 1159# Miscellaneous USB options
1143# 1160#
1144CONFIG_USB_DEVICEFS=y 1161CONFIG_USB_DEVICEFS=y
1145# CONFIG_USB_BANDWIDTH is not set
1146# CONFIG_USB_DYNAMIC_MINORS is not set 1162# CONFIG_USB_DYNAMIC_MINORS is not set
1147# CONFIG_USB_SUSPEND is not set 1163# CONFIG_USB_SUSPEND is not set
1148# CONFIG_USB_MULTITHREAD_PROBE is not set
1149# CONFIG_USB_OTG is not set 1164# CONFIG_USB_OTG is not set
1150 1165
1151# 1166#
@@ -1155,9 +1170,11 @@ CONFIG_USB_EHCI_HCD=y
1155# CONFIG_USB_EHCI_SPLIT_ISO is not set 1170# CONFIG_USB_EHCI_SPLIT_ISO is not set
1156# CONFIG_USB_EHCI_ROOT_HUB_TT is not set 1171# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
1157# CONFIG_USB_EHCI_TT_NEWSCHED is not set 1172# CONFIG_USB_EHCI_TT_NEWSCHED is not set
1173# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
1158# CONFIG_USB_ISP116X_HCD is not set 1174# CONFIG_USB_ISP116X_HCD is not set
1159CONFIG_USB_OHCI_HCD=y 1175CONFIG_USB_OHCI_HCD=y
1160# CONFIG_USB_OHCI_BIG_ENDIAN is not set 1176# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
1177# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
1161CONFIG_USB_OHCI_LITTLE_ENDIAN=y 1178CONFIG_USB_OHCI_LITTLE_ENDIAN=y
1162CONFIG_USB_UHCI_HCD=y 1179CONFIG_USB_UHCI_HCD=y
1163# CONFIG_USB_SL811_HCD is not set 1180# CONFIG_USB_SL811_HCD is not set
@@ -1208,6 +1225,7 @@ CONFIG_USB_HID=y
1208# CONFIG_USB_ATI_REMOTE2 is not set 1225# CONFIG_USB_ATI_REMOTE2 is not set
1209# CONFIG_USB_KEYSPAN_REMOTE is not set 1226# CONFIG_USB_KEYSPAN_REMOTE is not set
1210# CONFIG_USB_APPLETOUCH is not set 1227# CONFIG_USB_APPLETOUCH is not set
1228# CONFIG_USB_GTCO is not set
1211 1229
1212# 1230#
1213# USB Imaging devices 1231# USB Imaging devices
@@ -1313,6 +1331,10 @@ CONFIG_USB_MON=y
1313# 1331#
1314 1332
1315# 1333#
1334# Auxiliary Display support
1335#
1336
1337#
1316# Virtualization 1338# Virtualization
1317# 1339#
1318# CONFIG_KVM is not set 1340# CONFIG_KVM is not set
@@ -1512,6 +1534,7 @@ CONFIG_UNUSED_SYMBOLS=y
1512CONFIG_DEBUG_FS=y 1534CONFIG_DEBUG_FS=y
1513# CONFIG_HEADERS_CHECK is not set 1535# CONFIG_HEADERS_CHECK is not set
1514CONFIG_DEBUG_KERNEL=y 1536CONFIG_DEBUG_KERNEL=y
1537# CONFIG_DEBUG_SHIRQ is not set
1515CONFIG_LOG_BUF_SHIFT=18 1538CONFIG_LOG_BUF_SHIFT=18
1516CONFIG_DETECT_SOFTLOCKUP=y 1539CONFIG_DETECT_SOFTLOCKUP=y
1517# CONFIG_SCHEDSTATS is not set 1540# CONFIG_SCHEDSTATS is not set
@@ -1520,7 +1543,6 @@ CONFIG_DETECT_SOFTLOCKUP=y
1520# CONFIG_RT_MUTEX_TESTER is not set 1543# CONFIG_RT_MUTEX_TESTER is not set
1521# CONFIG_DEBUG_SPINLOCK is not set 1544# CONFIG_DEBUG_SPINLOCK is not set
1522# CONFIG_DEBUG_MUTEXES is not set 1545# CONFIG_DEBUG_MUTEXES is not set
1523# CONFIG_DEBUG_RWSEMS is not set
1524# CONFIG_DEBUG_LOCK_ALLOC is not set 1546# CONFIG_DEBUG_LOCK_ALLOC is not set
1525# CONFIG_PROVE_LOCKING is not set 1547# CONFIG_PROVE_LOCKING is not set
1526# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 1548# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
@@ -1560,4 +1582,5 @@ CONFIG_CRC32=y
1560# CONFIG_LIBCRC32C is not set 1582# CONFIG_LIBCRC32C is not set
1561CONFIG_ZLIB_INFLATE=y 1583CONFIG_ZLIB_INFLATE=y
1562CONFIG_PLIST=y 1584CONFIG_PLIST=y
1563CONFIG_IOMAP_COPY=y 1585CONFIG_HAS_IOMEM=y
1586CONFIG_HAS_IOPORT=y
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index ff499ef2a1ba..359eacc38509 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -21,6 +21,7 @@
21#include <linux/stddef.h> 21#include <linux/stddef.h>
22#include <linux/personality.h> 22#include <linux/personality.h>
23#include <linux/compat.h> 23#include <linux/compat.h>
24#include <linux/binfmts.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/i387.h> 27#include <asm/i387.h>
@@ -449,7 +450,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
449 450
450 /* Return stub is in 32bit vsyscall page */ 451 /* Return stub is in 32bit vsyscall page */
451 { 452 {
452 void __user *restorer = VSYSCALL32_SIGRETURN; 453 void __user *restorer;
454 if (current->binfmt->hasvdso)
455 restorer = VSYSCALL32_SIGRETURN;
456 else
457 restorer = (void *)&frame->retcode;
453 if (ka->sa.sa_flags & SA_RESTORER) 458 if (ka->sa.sa_flags & SA_RESTORER)
454 restorer = ka->sa.sa_restorer; 459 restorer = ka->sa.sa_restorer;
455 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); 460 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
@@ -495,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
495 ptrace_notify(SIGTRAP); 500 ptrace_notify(SIGTRAP);
496 501
497#if DEBUG_SIG 502#if DEBUG_SIG
498 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 503 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
499 current->comm, current->pid, frame, regs->rip, frame->pretcode); 504 current->comm, current->pid, frame, regs->rip, frame->pretcode);
500#endif 505#endif
501 506
@@ -601,7 +606,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
601 ptrace_notify(SIGTRAP); 606 ptrace_notify(SIGTRAP);
602 607
603#if DEBUG_SIG 608#if DEBUG_SIG
604 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 609 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
605 current->comm, current->pid, frame, regs->rip, frame->pretcode); 610 current->comm, current->pid, frame, regs->rip, frame->pretcode);
606#endif 611#endif
607 612
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5f32cf4de5fb..eda7a0d4dc15 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -718,4 +718,5 @@ ia32_sys_call_table:
718 .quad compat_sys_vmsplice 718 .quad compat_sys_vmsplice
719 .quad compat_sys_move_pages 719 .quad compat_sys_move_pages
720 .quad sys_getcpu 720 .quad sys_getcpu
721 .quad sys_epoll_pwait
721ia32_syscall_end: 722ia32_syscall_end:
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 3c7cbff04d3d..ae399458024b 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o
43 43
44obj-y += topology.o 44obj-y += topology.o
45obj-y += intel_cacheinfo.o 45obj-y += intel_cacheinfo.o
46obj-y += pcspeaker.o
46 47
47CFLAGS_vsyscall.o := $(PROFILING) -g0 48CFLAGS_vsyscall.o := $(PROFILING) -g0
48 49
@@ -56,3 +57,4 @@ quirks-y += ../../i386/kernel/quirks.o
56i8237-y += ../../i386/kernel/i8237.o 57i8237-y += ../../i386/kernel/i8237.o
57msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o 58msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
58alternative-y += ../../i386/kernel/alternative.o 59alternative-y += ../../i386/kernel/alternative.o
60pcspeaker-y += ../../i386/kernel/pcspeaker.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 5ebf62c7a3d2..23178ce6c783 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0;
58unsigned long acpi_video_flags; 58unsigned long acpi_video_flags;
59extern char wakeup_start, wakeup_end; 59extern char wakeup_start, wakeup_end;
60 60
61extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); 61extern unsigned long acpi_copy_wakeup_routine(unsigned long);
62 62
63static pgd_t low_ptr; 63static pgd_t low_ptr;
64 64
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 6fe191c58084..4651fd22b213 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
83 return 1; 83 return 1;
84 } 84 }
85 85
86#ifdef CONFIG_NUMA
87 /* NUMA memory to node map */
88 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
89 *addrp = nodemap_addr + nodemap_size;
90 return 1;
91 }
92#endif
86 /* XXX ramdisk image here? */ 93 /* XXX ramdisk image here? */
87 return 0; 94 return 0;
88} 95}
@@ -184,6 +191,37 @@ unsigned long __init e820_end_of_ram(void)
184} 191}
185 192
186/* 193/*
194 * Find the hole size in the range.
195 */
196unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
197{
198 unsigned long ram = 0;
199 int i;
200
201 for (i = 0; i < e820.nr_map; i++) {
202 struct e820entry *ei = &e820.map[i];
203 unsigned long last, addr;
204
205 if (ei->type != E820_RAM ||
206 ei->addr+ei->size <= start ||
207 ei->addr >= end)
208 continue;
209
210 addr = round_up(ei->addr, PAGE_SIZE);
211 if (addr < start)
212 addr = start;
213
214 last = round_down(ei->addr + ei->size, PAGE_SIZE);
215 if (last >= end)
216 last = end;
217
218 if (last > addr)
219 ram += last - addr;
220 }
221 return ((end - start) - ram);
222}
223
224/*
187 * Mark e820 reserved areas as busy for the resource manager. 225 * Mark e820 reserved areas as busy for the resource manager.
188 */ 226 */
189void __init e820_reserve_resources(void) 227void __init e820_reserve_resources(void)
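
[Editor's note] A worked example of the new e820_hole_size() (hypothetical map, not from this patch): with RAM entries [0, 0xa0000) and [0x100000, 0x80000000), the classic 640KB-1MB ISA hole falls out of the subtraction:

	/*
	 *   e820_hole_size(0, 0x80000000)
	 *     = (0x80000000 - 0) - (0xa0000 + (0x80000000 - 0x100000))
	 *     = 0x60000                        (384KB: the 640KB-1MB hole)
	 */
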
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 1e6f80870679..598a4d0351fc 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -163,6 +163,20 @@ startup_64:
163 */ 163 */
164 lgdt cpu_gdt_descr 164 lgdt cpu_gdt_descr
165 165
166 /* set up data segments. actually 0 would do too */
167 movl $__KERNEL_DS,%eax
168 movl %eax,%ds
169 movl %eax,%ss
170 movl %eax,%es
171
172 /*
173 * We don't really need to load %fs or %gs, but load them anyway
174 * to kill any stale realmode selectors. This allows execution
175 * under VT hardware.
176 */
177 movl %eax,%fs
178 movl %eax,%gs
179
166 /* 180 /*
167 * Setup up a dummy PDA. this is just for some early bootup code 181 * Setup up a dummy PDA. this is just for some early bootup code
168 * that does in_interrupt() 182 * that does in_interrupt()
@@ -173,12 +187,6 @@ startup_64:
173 shrq $32,%rdx 187 shrq $32,%rdx
174 wrmsr 188 wrmsr
175 189
176 /* set up data segments. actually 0 would do too */
177 movl $__KERNEL_DS,%eax
178 movl %eax,%ds
179 movl %eax,%ss
180 movl %eax,%es
181
182 /* esi is pointer to real mode structure with interesting info. 190 /* esi is pointer to real mode structure with interesting info.
183 pass it to C */ 191 pass it to C */
184 movl %esi, %edi 192 movl %esi, %edi
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 6be6730acb5c..566e64d966c4 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -831,7 +831,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
831 entry.delivery_mode = INT_DELIVERY_MODE; 831 entry.delivery_mode = INT_DELIVERY_MODE;
832 entry.dest_mode = INT_DEST_MODE; 832 entry.dest_mode = INT_DEST_MODE;
833 entry.mask = 0; /* enable IRQ */ 833 entry.mask = 0; /* enable IRQ */
834 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); 834 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
835 835
836 entry.trigger = irq_trigger(idx); 836 entry.trigger = irq_trigger(idx);
837 entry.polarity = irq_polarity(idx); 837 entry.polarity = irq_polarity(idx);
@@ -839,7 +839,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
839 if (irq_trigger(idx)) { 839 if (irq_trigger(idx)) {
840 entry.trigger = 1; 840 entry.trigger = 1;
841 entry.mask = 1; 841 entry.mask = 1;
842 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); 842 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
843 } 843 }
844 844
845 if (!apic && !IO_APIC_IRQ(irq)) 845 if (!apic && !IO_APIC_IRQ(irq))
@@ -851,7 +851,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
851 if (vector < 0) 851 if (vector < 0)
852 return; 852 return;
853 853
854 entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); 854 entry.dest = cpu_mask_to_apicid(mask);
855 entry.vector = vector; 855 entry.vector = vector;
856 856
857 ioapic_register_intr(irq, vector, IOAPIC_AUTO); 857 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
@@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
920 */ 920 */
921 entry.dest_mode = INT_DEST_MODE; 921 entry.dest_mode = INT_DEST_MODE;
922 entry.mask = 0; /* unmask IRQ now */ 922 entry.mask = 0; /* unmask IRQ now */
923 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); 923 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
924 entry.delivery_mode = INT_DELIVERY_MODE; 924 entry.delivery_mode = INT_DELIVERY_MODE;
925 entry.polarity = 0; 925 entry.polarity = 0;
926 entry.trigger = 0; 926 entry.trigger = 0;
@@ -1020,18 +1020,17 @@ void __apicdebuginit print_IO_APIC(void)
1020 1020
1021 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1021 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1022 1022
1023 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" 1023 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1024 " Stat Dest Deli Vect: \n"); 1024 " Stat Dmod Deli Vect: \n");
1025 1025
1026 for (i = 0; i <= reg_01.bits.entries; i++) { 1026 for (i = 0; i <= reg_01.bits.entries; i++) {
1027 struct IO_APIC_route_entry entry; 1027 struct IO_APIC_route_entry entry;
1028 1028
1029 entry = ioapic_read_entry(apic, i); 1029 entry = ioapic_read_entry(apic, i);
1030 1030
1031 printk(KERN_DEBUG " %02x %03X %02X ", 1031 printk(KERN_DEBUG " %02x %03X ",
1032 i, 1032 i,
1033 entry.dest.logical.logical_dest, 1033 entry.dest
1034 entry.dest.physical.physical_dest
1035 ); 1034 );
1036 1035
1037 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", 1036 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
@@ -1293,8 +1292,7 @@ void disable_IO_APIC(void)
1293 entry.dest_mode = 0; /* Physical */ 1292 entry.dest_mode = 0; /* Physical */
1294 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1293 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1295 entry.vector = 0; 1294 entry.vector = 0;
1296 entry.dest.physical.physical_dest = 1295 entry.dest = GET_APIC_ID(apic_read(APIC_ID));
1297 GET_APIC_ID(apic_read(APIC_ID));
1298 1296
1299 /* 1297 /*
1300 * Add it to the IO-APIC irq-routing table: 1298 * Add it to the IO-APIC irq-routing table:
@@ -1556,7 +1554,7 @@ static inline void unlock_ExtINT_logic(void)
1556 1554
1557 entry1.dest_mode = 0; /* physical delivery */ 1555 entry1.dest_mode = 0; /* physical delivery */
1558 entry1.mask = 0; /* unmask IRQ now */ 1556 entry1.mask = 0; /* unmask IRQ now */
1559 entry1.dest.physical.physical_dest = hard_smp_processor_id(); 1557 entry1.dest = hard_smp_processor_id();
1560 entry1.delivery_mode = dest_ExtINT; 1558 entry1.delivery_mode = dest_ExtINT;
1561 entry1.polarity = entry0.polarity; 1559 entry1.polarity = entry0.polarity;
1562 entry1.trigger = 0; 1560 entry1.trigger = 0;
@@ -2131,7 +2129,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
2131 2129
2132 entry.delivery_mode = INT_DELIVERY_MODE; 2130 entry.delivery_mode = INT_DELIVERY_MODE;
2133 entry.dest_mode = INT_DEST_MODE; 2131 entry.dest_mode = INT_DEST_MODE;
2134 entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); 2132 entry.dest = cpu_mask_to_apicid(mask);
2135 entry.trigger = triggering; 2133 entry.trigger = triggering;
2136 entry.polarity = polarity; 2134 entry.polarity = polarity;
2137 entry.mask = 1; /* Disabled (masked) */ 2135 entry.mask = 1; /* Disabled (masked) */
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index fe063d3cfe42..745b1f0f494e 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
114 if (!capable(CAP_SYS_RAWIO)) 114 if (!capable(CAP_SYS_RAWIO))
115 return -EPERM; 115 return -EPERM;
116 } 116 }
117 regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); 117 regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
118 return 0; 118 return 0;
119} 119}
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 0c06af6c13bc..3bc30d2c13d3 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -18,6 +18,7 @@
18#include <asm/uaccess.h> 18#include <asm/uaccess.h>
19#include <asm/io_apic.h> 19#include <asm/io_apic.h>
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/smp.h>
21 22
22atomic_t irq_err_count; 23atomic_t irq_err_count;
23 24
@@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
120 121
121 if (likely(irq < NR_IRQS)) 122 if (likely(irq < NR_IRQS))
122 generic_handle_irq(irq); 123 generic_handle_irq(irq);
123 else if (printk_ratelimit()) 124 else {
124 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", 125 if (!disable_apic)
125 __func__, smp_processor_id(), vector); 126 ack_APIC_irq();
127
128 if (printk_ratelimit())
129 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
130 __func__, smp_processor_id(), vector);
131 }
126 132
127 irq_exit(); 133 irq_exit();
128 134
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bdb54a2c9f18..8011a8e1c7d4 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -19,6 +19,7 @@
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/kmod.h>
22#include <asm/processor.h> 23#include <asm/processor.h>
23#include <asm/msr.h> 24#include <asm/msr.h>
24#include <asm/mce.h> 25#include <asm/mce.h>
@@ -42,6 +43,10 @@ static unsigned long console_logged;
42static int notify_user; 43static int notify_user;
43static int rip_msr; 44static int rip_msr;
44static int mce_bootlog = 1; 45static int mce_bootlog = 1;
46static atomic_t mce_events;
47
48static char trigger[128];
49static char *trigger_argv[2] = { trigger, NULL };
45 50
46/* 51/*
47 * Lockless MCE logging infrastructure. 52 * Lockless MCE logging infrastructure.
@@ -57,6 +62,7 @@ struct mce_log mcelog = {
57void mce_log(struct mce *mce) 62void mce_log(struct mce *mce)
58{ 63{
59 unsigned next, entry; 64 unsigned next, entry;
65 atomic_inc(&mce_events);
60 mce->finished = 0; 66 mce->finished = 0;
61 wmb(); 67 wmb();
62 for (;;) { 68 for (;;) {
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
161 } 167 }
162} 168}
163 169
170static void do_mce_trigger(void)
171{
172 static atomic_t mce_logged;
173 int events = atomic_read(&mce_events);
174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events);
177 call_usermodehelper(trigger, trigger_argv, NULL, -1);
178 }
179}
180
164/* 181/*
165 * The actual machine check handler 182 * The actual machine check handler
166 */ 183 */
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
234 } 251 }
235 252
236 /* Never do anything final in the polling timer */ 253 /* Never do anything final in the polling timer */
237 if (!regs) 254 if (!regs) {
255 /* Normal interrupt context here. Call trigger for any new
256 events. */
257 do_mce_trigger();
238 goto out; 258 goto out;
259 }
239 260
240 /* If we didn't find an uncorrectable error, pick 261 /* If we didn't find an uncorrectable error, pick
241 the last one (shouldn't happen, just being safe). */ 262 the last one (shouldn't happen, just being safe). */
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
606 } \ 627 } \
607 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 628 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
608 629
630/* TBD should generate these dynamically based on number of available banks */
609ACCESSOR(bank0ctl,bank[0],mce_restart()) 631ACCESSOR(bank0ctl,bank[0],mce_restart())
610ACCESSOR(bank1ctl,bank[1],mce_restart()) 632ACCESSOR(bank1ctl,bank[1],mce_restart())
611ACCESSOR(bank2ctl,bank[2],mce_restart()) 633ACCESSOR(bank2ctl,bank[2],mce_restart())
612ACCESSOR(bank3ctl,bank[3],mce_restart()) 634ACCESSOR(bank3ctl,bank[3],mce_restart())
613ACCESSOR(bank4ctl,bank[4],mce_restart()) 635ACCESSOR(bank4ctl,bank[4],mce_restart())
614ACCESSOR(bank5ctl,bank[5],mce_restart()) 636ACCESSOR(bank5ctl,bank[5],mce_restart())
615static struct sysdev_attribute * bank_attributes[NR_BANKS] = { 637
616 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 638static ssize_t show_trigger(struct sys_device *s, char *buf)
617 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; 639{
640 strcpy(buf, trigger);
641 strcat(buf, "\n");
642 return strlen(trigger) + 1;
643}
644
645static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
646{
647 char *p;
648 int len;
649 strncpy(trigger, buf, sizeof(trigger));
650 trigger[sizeof(trigger)-1] = 0;
651 len = strlen(trigger);
652 p = strchr(trigger, '\n');
653 if (*p) *p = 0;
654 return len;
655}
656
657static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
618ACCESSOR(tolerant,tolerant,) 658ACCESSOR(tolerant,tolerant,)
619ACCESSOR(check_interval,check_interval,mce_restart()) 659ACCESSOR(check_interval,check_interval,mce_restart())
660static struct sysdev_attribute *mce_attributes[] = {
661 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
662 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
663 &attr_tolerant, &attr_check_interval, &attr_trigger,
664 NULL
665};
620 666
621/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ 667/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
622static __cpuinit int mce_create_device(unsigned int cpu) 668static __cpuinit int mce_create_device(unsigned int cpu)
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
632 err = sysdev_register(&per_cpu(device_mce,cpu)); 678 err = sysdev_register(&per_cpu(device_mce,cpu));
633 679
634 if (!err) { 680 if (!err) {
635 for (i = 0; i < banks; i++) 681 for (i = 0; mce_attributes[i]; i++)
636 sysdev_create_file(&per_cpu(device_mce,cpu), 682 sysdev_create_file(&per_cpu(device_mce,cpu),
637 bank_attributes[i]); 683 mce_attributes[i]);
638 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
639 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
640 } 684 }
641 return err; 685 return err;
642} 686}
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu)
645{ 689{
646 int i; 690 int i;
647 691
648 for (i = 0; i < banks; i++) 692 for (i = 0; mce_attributes[i]; i++)
649 sysdev_remove_file(&per_cpu(device_mce,cpu), 693 sysdev_remove_file(&per_cpu(device_mce,cpu),
650 bank_attributes[i]); 694 mce_attributes[i]);
651 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
652 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
653 sysdev_unregister(&per_cpu(device_mce,cpu)); 695 sysdev_unregister(&per_cpu(device_mce,cpu));
654 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); 696 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
655} 697}
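
[Editor's note] Userspace arms the new trigger by writing a program path into the per-CPU sysfs attribute; a minimal sketch, assuming the path follows from the "machinecheck" sysdev name (the helper path is purely illustrative):

	#include <stdio.h>

	int main(void)
	{
		/* Path assumed from the per-CPU sysdev layout. */
		FILE *f = fopen("/sys/devices/system/machinecheck/"
				"machinecheck0/trigger", "w");
		if (!f)
			return 1;
		fputs("/usr/sbin/mce-helper\n", f);  /* illustrative helper */
		return fclose(f) ? 1 : 0;
	}
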
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 93c707257637..d0bd5d66e103 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -37,6 +37,8 @@
37#define THRESHOLD_MAX 0xFFF 37#define THRESHOLD_MAX 0xFFF
38#define INT_TYPE_APIC 0x00020000 38#define INT_TYPE_APIC 0x00020000
39#define MASK_VALID_HI 0x80000000 39#define MASK_VALID_HI 0x80000000
40#define MASK_CNTP_HI 0x40000000
41#define MASK_LOCKED_HI 0x20000000
40#define MASK_LVTOFF_HI 0x00F00000 42#define MASK_LVTOFF_HI 0x00F00000
41#define MASK_COUNT_EN_HI 0x00080000 43#define MASK_COUNT_EN_HI 0x00080000
42#define MASK_INT_TYPE_HI 0x00060000 44#define MASK_INT_TYPE_HI 0x00060000
@@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
122 for (block = 0; block < NR_BLOCKS; ++block) { 124 for (block = 0; block < NR_BLOCKS; ++block) {
123 if (block == 0) 125 if (block == 0)
124 address = MSR_IA32_MC0_MISC + bank * 4; 126 address = MSR_IA32_MC0_MISC + bank * 4;
125 else if (block == 1) 127 else if (block == 1) {
126 address = MCG_XBLK_ADDR 128 address = (low & MASK_BLKPTR_LO) >> 21;
127 + ((low & MASK_BLKPTR_LO) >> 21); 129 if (!address)
130 break;
131 address += MCG_XBLK_ADDR;
132 }
128 else 133 else
129 ++address; 134 ++address;
130 135
131 if (rdmsr_safe(address, &low, &high)) 136 if (rdmsr_safe(address, &low, &high))
132 continue; 137 break;
133 138
134 if (!(high & MASK_VALID_HI)) { 139 if (!(high & MASK_VALID_HI)) {
135 if (block) 140 if (block)
@@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
138 break; 143 break;
139 } 144 }
140 145
141 if (!(high & MASK_VALID_HI >> 1) || 146 if (!(high & MASK_CNTP_HI) ||
142 (high & MASK_VALID_HI >> 2)) 147 (high & MASK_LOCKED_HI))
143 continue; 148 continue;
144 149
145 if (!block) 150 if (!block)
@@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void)
187 192
188 /* assume first bank caused it */ 193 /* assume first bank caused it */
189 for (bank = 0; bank < NR_BANKS; ++bank) { 194 for (bank = 0; bank < NR_BANKS; ++bank) {
195 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
196 continue;
190 for (block = 0; block < NR_BLOCKS; ++block) { 197 for (block = 0; block < NR_BLOCKS; ++block) {
191 if (block == 0) 198 if (block == 0)
192 address = MSR_IA32_MC0_MISC + bank * 4; 199 address = MSR_IA32_MC0_MISC + bank * 4;
193 else if (block == 1) 200 else if (block == 1) {
194 address = MCG_XBLK_ADDR 201 address = (low & MASK_BLKPTR_LO) >> 21;
195 + ((low & MASK_BLKPTR_LO) >> 21); 202 if (!address)
203 break;
204 address += MCG_XBLK_ADDR;
205 }
196 else 206 else
197 ++address; 207 ++address;
198 208
199 if (rdmsr_safe(address, &low, &high)) 209 if (rdmsr_safe(address, &low, &high))
200 continue; 210 break;
201 211
202 if (!(high & MASK_VALID_HI)) { 212 if (!(high & MASK_VALID_HI)) {
203 if (block) 213 if (block)
@@ -206,10 +216,14 @@ asmlinkage void mce_threshold_interrupt(void)
206 break; 216 break;
207 } 217 }
208 218
209 if (!(high & MASK_VALID_HI >> 1) || 219 if (!(high & MASK_CNTP_HI) ||
210 (high & MASK_VALID_HI >> 2)) 220 (high & MASK_LOCKED_HI))
211 continue; 221 continue;
212 222
223 /* Log the machine check that caused the threshold
224 event. */
225 do_machine_check(NULL, 0);
226
213 if (high & MASK_OVERFLOW_HI) { 227 if (high & MASK_OVERFLOW_HI) {
214 rdmsrl(address, m.misc); 228 rdmsrl(address, m.misc);
215 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, 229 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
@@ -385,7 +399,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
385 return 0; 399 return 0;
386 400
387 if (rdmsr_safe(address, &low, &high)) 401 if (rdmsr_safe(address, &low, &high))
388 goto recurse; 402 return 0;
389 403
390 if (!(high & MASK_VALID_HI)) { 404 if (!(high & MASK_VALID_HI)) {
391 if (block) 405 if (block)
@@ -394,8 +408,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
394 return 0; 408 return 0;
395 } 409 }
396 410
397 if (!(high & MASK_VALID_HI >> 1) || 411 if (!(high & MASK_CNTP_HI) ||
398 (high & MASK_VALID_HI >> 2)) 412 (high & MASK_LOCKED_HI))
399 goto recurse; 413 goto recurse;
400 414
401 b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); 415 b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 9cb42ecb7f89..486f4c61a948 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void)
172{ 172{
173 switch (boot_cpu_data.x86_vendor) { 173 switch (boot_cpu_data.x86_vendor) {
174 case X86_VENDOR_AMD: 174 case X86_VENDOR_AMD:
175 return boot_cpu_data.x86 == 15; 175 return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16;
176 case X86_VENDOR_INTEL: 176 case X86_VENDOR_INTEL:
177 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 177 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
178 return 1; 178 return 1;
@@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data)
214} 214}
215#endif 215#endif
216 216
217static unsigned int adjust_for_32bit_ctr(unsigned int hz)
218{
219 unsigned int retval = hz;
220
221 /*
222 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
223 * are writable, with higher bits sign extending from bit 31.
 224 * So we can only program the counter with 31-bit values; bit 31
 225 * must be set so that bits 32 and above sign-extend to 1.
 226 * Find the appropriate nmi_hz.
227 */
228 if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) {
229 retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
230 }
231 return retval;
232}
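
[Editor's note] A worked example of the adjustment (illustrative numbers): on a 3 GHz CPU, cpu_khz = 3000000, so one second is 3,000,000,000 cycles, which exceeds 0x7fffffff and cannot be programmed into the 31 writable magnitude bits. The function then returns 3000000000 / 0x7fffffff + 1 = 2, and the resulting half-second period fits again:

	/* Illustrative check for cpu_khz = 3000000 (3 GHz): */
	unsigned int hz = adjust_for_32bit_ctr(1);    /* -> 2 */
	/* (u64)3000000 * 1000 / 2 == 1500000000 <= 0x7fffffff, so the
	   counter write -(cpu_khz * 1000 / nmi_hz) now fits in 31 bits. */
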
233
217int __init check_nmi_watchdog (void) 234int __init check_nmi_watchdog (void)
218{ 235{
219 int *counts; 236 int *counts;
@@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void)
268 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 285 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
269 286
270 nmi_hz = 1; 287 nmi_hz = 1;
271 /* 288 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0)
272 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter 289 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
273 * are writable, with higher bits sign extending from bit 31.
274 * So, we can only program the counter with 31 bit values and
275 * 32nd bit should be 1, for 33.. to be 1.
276 * Find the appropriate nmi_hz
277 */
278 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
279 ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
280 nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
281 }
282 } 290 }
283 291
284 kfree(counts); 292 kfree(counts);
@@ -360,6 +368,33 @@ void enable_timer_nmi_watchdog(void)
360 } 368 }
361} 369}
362 370
371static void __acpi_nmi_disable(void *__unused)
372{
373 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
374}
375
376/*
377 * Disable timer based NMIs on all CPUs:
378 */
379void acpi_nmi_disable(void)
380{
381 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
382 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
383}
384
385static void __acpi_nmi_enable(void *__unused)
386{
387 apic_write(APIC_LVT0, APIC_DM_NMI);
388}
389
390/*
391 * Enable timer based NMIs on all CPUs:
392 */
393void acpi_nmi_enable(void)
394{
395 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
396 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
397}
363#ifdef CONFIG_PM 398#ifdef CONFIG_PM
364 399
365static int nmi_pm_active; /* nmi_active before suspend */ 400static int nmi_pm_active; /* nmi_active before suspend */
@@ -634,7 +669,9 @@ static int setup_intel_arch_watchdog(void)
634 669
635 /* setup the timer */ 670 /* setup the timer */
636 wrmsr(evntsel_msr, evntsel, 0); 671 wrmsr(evntsel_msr, evntsel, 0);
637 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); 672
673 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
674 wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
638 675
639 apic_write(APIC_LVTPC, APIC_DM_NMI); 676 apic_write(APIC_LVTPC, APIC_DM_NMI);
640 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 677 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -855,15 +892,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
855 dummy &= ~P4_CCCR_OVF; 892 dummy &= ~P4_CCCR_OVF;
856 wrmsrl(wd->cccr_msr, dummy); 893 wrmsrl(wd->cccr_msr, dummy);
857 apic_write(APIC_LVTPC, APIC_DM_NMI); 894 apic_write(APIC_LVTPC, APIC_DM_NMI);
895 /* start the cycle over again */
896 wrmsrl(wd->perfctr_msr,
897 -((u64)cpu_khz * 1000 / nmi_hz));
858 } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { 898 } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
859 /* 899 /*
 860 * ArchPerfmon/Core Duo needs to re-unmask 900 * ArchPerfmon/Core Duo needs to re-unmask
861 * the apic vector 901 * the apic vector
862 */ 902 */
863 apic_write(APIC_LVTPC, APIC_DM_NMI); 903 apic_write(APIC_LVTPC, APIC_DM_NMI);
904 /* ARCH_PERFMON has 32 bit counter writes */
905 wrmsr(wd->perfctr_msr,
906 (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
907 } else {
908 /* start the cycle over again */
909 wrmsrl(wd->perfctr_msr,
910 -((u64)cpu_khz * 1000 / nmi_hz));
864 } 911 }
865 /* start the cycle over again */
866 wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
867 rc = 1; 912 rc = 1;
868 } else if (nmi_watchdog == NMI_IO_APIC) { 913 } else if (nmi_watchdog == NMI_IO_APIC) {
869 /* don't know how to accurately check for this. 914 /* don't know how to accurately check for this.
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 3d65b1d4c2b3..04480c3b68f5 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = {
138 138
139#define PHB_DEBUG_STUFF_OFFSET 0x0020 139#define PHB_DEBUG_STUFF_OFFSET 0x0020
140 140
141#define EMERGENCY_PAGES 32 /* = 128KB */
142
141unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 143unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
142static int translate_empty_slots __read_mostly = 0; 144static int translate_empty_slots __read_mostly = 0;
143static int calgary_detected __read_mostly = 0; 145static int calgary_detected __read_mostly = 0;
@@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
296{ 298{
297 unsigned long entry; 299 unsigned long entry;
298 unsigned long badbit; 300 unsigned long badbit;
301 unsigned long badend;
302
303 /* were we called with bad_dma_address? */
304 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
305 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
306 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
307 "address 0x%Lx\n", dma_addr);
308 WARN_ON(1);
309 return;
310 }
299 311
300 entry = dma_addr >> PAGE_SHIFT; 312 entry = dma_addr >> PAGE_SHIFT;
301 313
@@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
656 u64 start; 668 u64 start;
657 struct iommu_table *tbl = dev->sysdata; 669 struct iommu_table *tbl = dev->sysdata;
658 670
659 /* reserve bad_dma_address in case it's a legal address */ 671 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
660 iommu_range_reserve(tbl, bad_dma_address, 1); 672 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
661 673
662 /* avoid the BIOS/VGA first 640KB-1MB region */ 674 /* avoid the BIOS/VGA first 640KB-1MB region */
663 start = (640 * 1024); 675 start = (640 * 1024);
@@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void)
1176 } 1188 }
1177 1189
1178 force_iommu = 1; 1190 force_iommu = 1;
1191 bad_dma_address = 0x0;
1179 dma_ops = &calgary_dma_ops; 1192 dma_ops = &calgary_dma_ops;
1180 1193
1181 return 0; 1194 return 0;
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 683b7a5c1ab3..651ccfb06697 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask)
223} 223}
224EXPORT_SYMBOL(dma_set_mask); 224EXPORT_SYMBOL(dma_set_mask);
225 225
226/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] 226/*
227 [,forcesac][,fullflush][,nomerge][,biomerge] 227 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
228 size set size of iommu (in bytes) 228 * documentation.
229 noagp don't initialize the AGP driver and use full aperture. 229 */
230 off don't use the IOMMU
231 leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
232 memaper[=order] allocate an own aperture over RAM with size 32MB^order.
233 noforce don't force IOMMU usage. Default.
234 force Force IOMMU.
235 merge Do lazy merging. This may improve performance on some block devices.
236 Implies force (experimental)
237 biomerge Do merging at the BIO layer. This is more efficient than merge,
238 but should be only done with very big IOMMUs. Implies merge,force.
239 nomerge Don't do SG merging.
240 forcesac For SAC mode for masks <40bits (experimental)
241 fullflush Flush IOMMU on each allocation (default)
242 nofullflush Don't use IOMMU fullflush
243 allowed overwrite iommu off workarounds for specific chipsets.
244 soft Use software bounce buffering (default for Intel machines)
245 noaperture Don't touch the aperture for AGP.
246 allowdac Allow DMA >4GB
247 nodac Forbid DMA >4GB
248 panic Force panic when IOMMU overflows
249*/
250__init int iommu_setup(char *p) 230__init int iommu_setup(char *p)
251{ 231{
252 iommu_merge = 1; 232 iommu_merge = 1;
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index fc1960f1f243..030eb3753358 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
185static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) 185static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
186{ 186{
187 u64 mask = *dev->dma_mask; 187 u64 mask = *dev->dma_mask;
188 int high = addr + size >= mask; 188 int high = addr + size > mask;
189 int mmu = high; 189 int mmu = high;
190 if (force_iommu) 190 if (force_iommu)
191 mmu = 1; 191 mmu = 1;
@@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size
195static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 195static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
196{ 196{
197 u64 mask = *dev->dma_mask; 197 u64 mask = *dev->dma_mask;
198 int high = addr + size >= mask; 198 int high = addr + size > mask;
199 int mmu = high; 199 int mmu = high;
200 return mmu; 200 return mmu;
201} 201}
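
[Editor's note] The comparison fix above is a classic off-by-one: a buffer of size bytes at addr last touches addr + size - 1. A short illustration (values hypothetical):

	/* dma_mask = 0xffffffff (32-bit device):
	 *
	 *   addr = 0xfffff000, size = 0x0fff:
	 *       addr + size == 0xffffffff == mask; the last byte is
	 *       0xfffffffe, fully addressable.  The old '>=' test
	 *       needlessly bounced this buffer, the new '>' does not.
	 *
	 *   addr = 0xfffff000, size = 0x1000:
	 *       addr + size == 0x100000000 > mask; remapped by both tests.
	 */
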
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index addc14af0c56..4326a690a509 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
536 } 536 }
537 ret = 0; 537 ret = 0;
538 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { 538 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
539 ret |= __get_user(tmp, (unsigned long __user *) data); 539 ret = __get_user(tmp, (unsigned long __user *) data);
540 putreg(child, ui, tmp); 540 if (ret)
541 break;
542 ret = putreg(child, ui, tmp);
543 if (ret)
544 break;
541 data += sizeof(long); 545 data += sizeof(long);
542 } 546 }
543 break; 547 break;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 60477244d1a3..3d98b696881d 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -138,128 +138,6 @@ struct resource code_resource = {
138 .flags = IORESOURCE_RAM, 138 .flags = IORESOURCE_RAM,
139}; 139};
140 140
141#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
142
143static struct resource system_rom_resource = {
144 .name = "System ROM",
145 .start = 0xf0000,
146 .end = 0xfffff,
147 .flags = IORESOURCE_ROM,
148};
149
150static struct resource extension_rom_resource = {
151 .name = "Extension ROM",
152 .start = 0xe0000,
153 .end = 0xeffff,
154 .flags = IORESOURCE_ROM,
155};
156
157static struct resource adapter_rom_resources[] = {
158 { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
159 .flags = IORESOURCE_ROM },
160 { .name = "Adapter ROM", .start = 0, .end = 0,
161 .flags = IORESOURCE_ROM },
162 { .name = "Adapter ROM", .start = 0, .end = 0,
163 .flags = IORESOURCE_ROM },
164 { .name = "Adapter ROM", .start = 0, .end = 0,
165 .flags = IORESOURCE_ROM },
166 { .name = "Adapter ROM", .start = 0, .end = 0,
167 .flags = IORESOURCE_ROM },
168 { .name = "Adapter ROM", .start = 0, .end = 0,
169 .flags = IORESOURCE_ROM }
170};
171
172static struct resource video_rom_resource = {
173 .name = "Video ROM",
174 .start = 0xc0000,
175 .end = 0xc7fff,
176 .flags = IORESOURCE_ROM,
177};
178
179static struct resource video_ram_resource = {
180 .name = "Video RAM area",
181 .start = 0xa0000,
182 .end = 0xbffff,
183 .flags = IORESOURCE_RAM,
184};
185
186#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
187
188static int __init romchecksum(unsigned char *rom, unsigned long length)
189{
190 unsigned char *p, sum = 0;
191
192 for (p = rom; p < rom + length; p++)
193 sum += *p;
194 return sum == 0;
195}
196
197static void __init probe_roms(void)
198{
199 unsigned long start, length, upper;
200 unsigned char *rom;
201 int i;
202
203 /* video rom */
204 upper = adapter_rom_resources[0].start;
205 for (start = video_rom_resource.start; start < upper; start += 2048) {
206 rom = isa_bus_to_virt(start);
207 if (!romsignature(rom))
208 continue;
209
210 video_rom_resource.start = start;
211
212 /* 0 < length <= 0x7f * 512, historically */
213 length = rom[2] * 512;
214
215 /* if checksum okay, trust length byte */
216 if (length && romchecksum(rom, length))
217 video_rom_resource.end = start + length - 1;
218
219 request_resource(&iomem_resource, &video_rom_resource);
220 break;
221 }
222
223 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
224 if (start < upper)
225 start = upper;
226
227 /* system rom */
228 request_resource(&iomem_resource, &system_rom_resource);
229 upper = system_rom_resource.start;
230
231 /* check for extension rom (ignore length byte!) */
232 rom = isa_bus_to_virt(extension_rom_resource.start);
233 if (romsignature(rom)) {
234 length = extension_rom_resource.end - extension_rom_resource.start + 1;
235 if (romchecksum(rom, length)) {
236 request_resource(&iomem_resource, &extension_rom_resource);
237 upper = extension_rom_resource.start;
238 }
239 }
240
241 /* check for adapter roms on 2k boundaries */
242 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
243 start += 2048) {
244 rom = isa_bus_to_virt(start);
245 if (!romsignature(rom))
246 continue;
247
248 /* 0 < length <= 0x7f * 512, historically */
249 length = rom[2] * 512;
250
251 /* but accept any length that fits if checksum okay */
252 if (!length || start + length > upper || !romchecksum(rom, length))
253 continue;
254
255 adapter_rom_resources[i].start = start;
256 adapter_rom_resources[i].end = start + length - 1;
257 request_resource(&iomem_resource, &adapter_rom_resources[i]);
258
259 start = adapter_rom_resources[i++].end & ~2047UL;
260 }
261}
262
263#ifdef CONFIG_PROC_VMCORE 141#ifdef CONFIG_PROC_VMCORE
264/* elfcorehdr= specifies the location of elf core header 142/* elfcorehdr= specifies the location of elf core header
265 * stored by the crashed kernel. This option will be passed 143 * stored by the crashed kernel. This option will be passed
@@ -444,6 +322,11 @@ void __init setup_arch(char **cmdline_p)
444 /* reserve ebda region */ 322 /* reserve ebda region */
445 if (ebda_addr) 323 if (ebda_addr)
446 reserve_bootmem_generic(ebda_addr, ebda_size); 324 reserve_bootmem_generic(ebda_addr, ebda_size);
325#ifdef CONFIG_NUMA
326 /* reserve nodemap region */
327 if (nodemap_addr)
328 reserve_bootmem_generic(nodemap_addr, nodemap_size);
329#endif
447 330
448#ifdef CONFIG_SMP 331#ifdef CONFIG_SMP
449 /* 332 /*
@@ -519,15 +402,11 @@ void __init setup_arch(char **cmdline_p)
519 init_apic_mappings(); 402 init_apic_mappings();
520 403
521 /* 404 /*
522 * Request address space for all standard RAM and ROM resources 405 * We trust e820 completely. No explicit ROM probing in memory.
523 * and also for regions reported as reserved by the e820. 406 */
524 */
525 probe_roms();
526 e820_reserve_resources(); 407 e820_reserve_resources();
527 e820_mark_nosave_regions(); 408 e820_mark_nosave_regions();
528 409
529 request_resource(&iomem_resource, &video_ram_resource);
530
531 { 410 {
532 unsigned i; 411 unsigned i;
533 /* request I/O space for devices used on all i[345]86 PCs */ 412 /* request I/O space for devices used on all i[345]86 PCs */
@@ -1063,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1063 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 942 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1064 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, 943 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
1065 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, 944 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
1066 NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", 945 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
946 "3dnowext", "3dnow",
1067 947
1068 /* Transmeta-defined */ 948 /* Transmeta-defined */
1069 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, 949 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -1081,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1081 /* Intel-defined (#2) */ 961 /* Intel-defined (#2) */
1082 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", 962 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
1083 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, 963 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
1084 NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, 964 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
1085 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 965 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1086 966
1087 /* VIA/Cyrix/Centaur-defined */ 967 /* VIA/Cyrix/Centaur-defined */
@@ -1091,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1091 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 971 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1092 972
1093 /* AMD-defined (#2) */ 973 /* AMD-defined (#2) */
1094 "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL, 974 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
1095 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 975 "altmovcr8", "abm", "sse4a",
976 "misalignsse", "3dnowprefetch",
977 "osvw", "ibs", NULL, NULL, NULL, NULL,
1096 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 978 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1097 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 979 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1098 }; 980 };
@@ -1103,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1103 "ttp", /* thermal trip */ 985 "ttp", /* thermal trip */
1104 "tm", 986 "tm",
1105 "stc", 987 "stc",
988 "100mhzsteps",
989 "hwpstate",
990 NULL, /* tsc invariant mapped to constant_tsc */
1106 NULL, 991 NULL,
1107 /* nothing */ /* constant_tsc - moved to flags */ 992 /* nothing */ /* constant_tsc - moved to flags */
1108 }; 993 };
@@ -1219,23 +1104,3 @@ struct seq_operations cpuinfo_op = {
1219 .stop = c_stop, 1104 .stop = c_stop,
1220 .show = show_cpuinfo, 1105 .show = show_cpuinfo,
1221}; 1106};
1222
1223#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
1224#include <linux/platform_device.h>
1225static __init int add_pcspkr(void)
1226{
1227 struct platform_device *pd;
1228 int ret;
1229
1230 pd = platform_device_alloc("pcspkr", -1);
1231 if (!pd)
1232 return -ENOMEM;
1233
1234 ret = platform_device_add(pd);
1235 if (ret)
1236 platform_device_put(pd);
1237
1238 return ret;
1239}
1240device_initcall(add_pcspkr);
1241#endif
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 8c4b80fe71a1..6a70b55f719d 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); 37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38 38
39unsigned long __supported_pte_mask __read_mostly = ~0UL; 39unsigned long __supported_pte_mask __read_mostly = ~0UL;
40EXPORT_SYMBOL(__supported_pte_mask);
41static int do_not_nx __cpuinitdata = 0; 40static int do_not_nx __cpuinitdata = 0;
42 41
43/* noexec=on|off 42/* noexec=on|off
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 6026b31d037e..65ac2c6b34a6 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr)
32 trace->skip--; 32 trace->skip--;
33 return; 33 return;
34 } 34 }
35 if (trace->nr_entries < trace->max_entries - 1) 35 if (trace->nr_entries < trace->max_entries)
36 trace->entries[trace->nr_entries++] = addr; 36 trace->entries[trace->nr_entries++] = addr;
37} 37}
38 38
@@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = {
49void save_stack_trace(struct stack_trace *trace, struct task_struct *task) 49void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
50{ 50{
51 dump_trace(task, NULL, NULL, &save_stack_ops, trace); 51 dump_trace(task, NULL, NULL, &save_stack_ops, trace);
52 trace->entries[trace->nr_entries++] = ULONG_MAX; 52 if (trace->nr_entries < trace->max_entries)
53 trace->entries[trace->nr_entries++] = ULONG_MAX;
53} 54}
54EXPORT_SYMBOL(save_stack_trace); 55EXPORT_SYMBOL(save_stack_trace);
55 56
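Both hunks fix the same off-by-one family: the old code reserved a slot it never needed (max_entries - 1) and then stored the ULONG_MAX terminator unconditionally, which could write one past the buffer when the trace was full. A small sketch of the corrected invariant, with a hypothetical minimal struct in place of the kernel's:

#include <limits.h>
#include <stdio.h>

struct stack_trace {
	unsigned int nr_entries, max_entries;
	unsigned long *entries;
};

/* Append one address; silently drop it when the buffer is full. */
static void trace_append(struct stack_trace *t, unsigned long addr)
{
	if (t->nr_entries < t->max_entries)
		t->entries[t->nr_entries++] = addr;
}

int main(void)
{
	unsigned long buf[2];
	struct stack_trace t = { 0, 2, buf };

	trace_append(&t, 0x1000);
	trace_append(&t, 0x2000);
	trace_append(&t, ULONG_MAX);	/* terminator dropped: buffer full */
	printf("%u entries stored\n", t.nr_entries);
	return 0;
}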
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 335cc91c49b7..3cc6886f1fb7 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc);
657 657
658#define TICK_COUNT 100000000 658#define TICK_COUNT 100000000
659#define TICK_MIN 5000 659#define TICK_MIN 5000
660#define MAX_READ_RETRIES 5
660 661
661/* 662/*
662 * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none 663 * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
@@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc);
664 */ 665 */
665static void __init read_hpet_tsc(int *hpet, int *tsc) 666static void __init read_hpet_tsc(int *hpet, int *tsc)
666{ 667{
667 int tsc1, tsc2, hpet1; 668 int tsc1, tsc2, hpet1, retries = 0;
669 static int msg;
668 670
669 do { 671 do {
670 tsc1 = get_cycles_sync(); 672 tsc1 = get_cycles_sync();
671 hpet1 = hpet_readl(HPET_COUNTER); 673 hpet1 = hpet_readl(HPET_COUNTER);
672 tsc2 = get_cycles_sync(); 674 tsc2 = get_cycles_sync();
673 } while (tsc2 - tsc1 > TICK_MIN); 675 } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES);
676 if (retries >= MAX_READ_RETRIES && !msg++)
677 printk(KERN_WARNING
678 "hpet.c: exceeded max retries to read HPET & TSC\n");
674 *hpet = hpet1; 679 *hpet = hpet1;
675 *tsc = tsc2; 680 *tsc = tsc2;
676} 681}
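The pattern here (read TSC, read HPET, read TSC again, retry if the bracket is too wide) previously looped forever when every attempt was hit by an SMI; the patch bounds it at MAX_READ_RETRIES and warns once. A user-space sketch of the same bounded-bracket idiom, with clock_gettime standing in for the TSC/HPET reads:

#include <stdio.h>
#include <time.h>

#define MAX_READ_RETRIES 5
#define TICK_MIN_NS 500		/* max tolerated bracket width */

static long long now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	long long t1, t2, sample;
	int retries = 0;

	do {
		t1 = now_ns();
		sample = now_ns();	/* the "HPET" read being bracketed */
		t2 = now_ns();
	} while (t2 - t1 > TICK_MIN_NS && retries++ < MAX_READ_RETRIES);

	if (retries >= MAX_READ_RETRIES)
		fprintf(stderr, "exceeded max retries for a clean read\n");
	printf("sample = %lld ns (bracket %lld ns)\n", sample, t2 - t1);
	return 0;
}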
@@ -1221,8 +1226,9 @@ static void hpet_rtc_timer_reinit(void)
1221 if (PIE_on) 1226 if (PIE_on)
1222 PIE_count += lost_ints; 1227 PIE_count += lost_ints;
1223 1228
1224 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", 1229 if (printk_ratelimit())
1225 hpet_rtc_int_freq); 1230 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
1231 hpet_rtc_int_freq);
1226 } 1232 }
1227} 1233}
1228 1234
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 6d77e4797a47..0dffae69f4ad 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
26EXPORT_SYMBOL(__put_user_8); 26EXPORT_SYMBOL(__put_user_8);
27 27
28EXPORT_SYMBOL(copy_user_generic); 28EXPORT_SYMBOL(copy_user_generic);
29EXPORT_SYMBOL(__copy_user_nocache);
29EXPORT_SYMBOL(copy_from_user); 30EXPORT_SYMBOL(copy_from_user);
30EXPORT_SYMBOL(copy_to_user); 31EXPORT_SYMBOL(copy_to_user);
31EXPORT_SYMBOL(__copy_from_user_inatomic); 32EXPORT_SYMBOL(__copy_from_user_inatomic);
@@ -34,8 +35,8 @@ EXPORT_SYMBOL(copy_page);
34EXPORT_SYMBOL(clear_page); 35EXPORT_SYMBOL(clear_page);
35 36
36#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
37extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); 38extern void __write_lock_failed(rwlock_t *rw);
38extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); 39extern void __read_lock_failed(rwlock_t *rw);
39EXPORT_SYMBOL(__write_lock_failed); 40EXPORT_SYMBOL(__write_lock_failed);
40EXPORT_SYMBOL(__read_lock_failed); 41EXPORT_SYMBOL(__read_lock_failed);
41#endif 42#endif
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index b78d4170fce2..8d5f835af481 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ 9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
10 usercopy.o getuser.o putuser.o \ 10 usercopy.o getuser.o putuser.o \
11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o 11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o
12lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o 12lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S
new file mode 100644
index 000000000000..4620efb12f13
--- /dev/null
+++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#include <linux/linkage.h>
8#include <asm/dwarf2.h>
9
10#define FIX_ALIGNMENT 1
11
12#include <asm/current.h>
13#include <asm/asm-offsets.h>
14#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17/*
18 * copy_user_nocache - Uncached memory copy with exception handling
19 * Uses non-temporal stores so the destination is not pulled into the cache.
20 *
21 * Input:
22 * rdi destination
23 * rsi source
24 * rdx count
25 * rcx zero flag: when non-zero, zero the remaining bytes on exception
26 *
27 * Output:
28 * eax uncopied bytes or 0 if successful.
29 */
30ENTRY(__copy_user_nocache)
31 CFI_STARTPROC
32 pushq %rbx
33 CFI_ADJUST_CFA_OFFSET 8
34 CFI_REL_OFFSET rbx, 0
35 pushq %rcx /* save zero flag */
36 CFI_ADJUST_CFA_OFFSET 8
37 CFI_REL_OFFSET rcx, 0
38
39 xorl %eax,%eax /* zero for the exception handler */
40
41#ifdef FIX_ALIGNMENT
42 /* check for bad alignment of destination */
43 movl %edi,%ecx
44 andl $7,%ecx
45 jnz .Lbad_alignment
46.Lafter_bad_alignment:
47#endif
48
49 movq %rdx,%rcx
50
51 movl $64,%ebx
52 shrq $6,%rdx
53 decq %rdx
54 js .Lhandle_tail
55
56 .p2align 4
57.Lloop:
58.Ls1: movq (%rsi),%r11
59.Ls2: movq 1*8(%rsi),%r8
60.Ls3: movq 2*8(%rsi),%r9
61.Ls4: movq 3*8(%rsi),%r10
62.Ld1: movnti %r11,(%rdi)
63.Ld2: movnti %r8,1*8(%rdi)
64.Ld3: movnti %r9,2*8(%rdi)
65.Ld4: movnti %r10,3*8(%rdi)
66
67.Ls5: movq 4*8(%rsi),%r11
68.Ls6: movq 5*8(%rsi),%r8
69.Ls7: movq 6*8(%rsi),%r9
70.Ls8: movq 7*8(%rsi),%r10
71.Ld5: movnti %r11,4*8(%rdi)
72.Ld6: movnti %r8,5*8(%rdi)
73.Ld7: movnti %r9,6*8(%rdi)
74.Ld8: movnti %r10,7*8(%rdi)
75
76 dec %rdx
77
78 leaq 64(%rsi),%rsi
79 leaq 64(%rdi),%rdi
80
81 jns .Lloop
82
83 .p2align 4
84.Lhandle_tail:
85 movl %ecx,%edx
86 andl $63,%ecx
87 shrl $3,%ecx
88 jz .Lhandle_7
89 movl $8,%ebx
90 .p2align 4
91.Lloop_8:
92.Ls9: movq (%rsi),%r8
93.Ld9: movnti %r8,(%rdi)
94 decl %ecx
95 leaq 8(%rdi),%rdi
96 leaq 8(%rsi),%rsi
97 jnz .Lloop_8
98
99.Lhandle_7:
100 movl %edx,%ecx
101 andl $7,%ecx
102 jz .Lende
103 .p2align 4
104.Lloop_1:
105.Ls10: movb (%rsi),%bl
106.Ld10: movb %bl,(%rdi)
107 incq %rdi
108 incq %rsi
109 decl %ecx
110 jnz .Lloop_1
111
112 CFI_REMEMBER_STATE
113.Lende:
114 popq %rcx
115 CFI_ADJUST_CFA_OFFSET -8
116 CFI_RESTORE %rcx
117 popq %rbx
118 CFI_ADJUST_CFA_OFFSET -8
119 CFI_RESTORE rbx
120 ret
121 CFI_RESTORE_STATE
122
123#ifdef FIX_ALIGNMENT
124 /* align destination */
125 .p2align 4
126.Lbad_alignment:
127 movl $8,%r9d
128 subl %ecx,%r9d
129 movl %r9d,%ecx
130 cmpq %r9,%rdx
131 jz .Lhandle_7
132 js .Lhandle_7
133.Lalign_1:
134.Ls11: movb (%rsi),%bl
135.Ld11: movb %bl,(%rdi)
136 incq %rsi
137 incq %rdi
138 decl %ecx
139 jnz .Lalign_1
140 subq %r9,%rdx
141 jmp .Lafter_bad_alignment
142#endif
143
144 /* table sorted by exception address */
145 .section __ex_table,"a"
146 .align 8
147 .quad .Ls1,.Ls1e
148 .quad .Ls2,.Ls2e
149 .quad .Ls3,.Ls3e
150 .quad .Ls4,.Ls4e
151 .quad .Ld1,.Ls1e
152 .quad .Ld2,.Ls2e
153 .quad .Ld3,.Ls3e
154 .quad .Ld4,.Ls4e
155 .quad .Ls5,.Ls5e
156 .quad .Ls6,.Ls6e
157 .quad .Ls7,.Ls7e
158 .quad .Ls8,.Ls8e
159 .quad .Ld5,.Ls5e
160 .quad .Ld6,.Ls6e
161 .quad .Ld7,.Ls7e
162 .quad .Ld8,.Ls8e
163 .quad .Ls9,.Le_quad
164 .quad .Ld9,.Le_quad
165 .quad .Ls10,.Le_byte
166 .quad .Ld10,.Le_byte
167#ifdef FIX_ALIGNMENT
168 .quad .Ls11,.Lzero_rest
169 .quad .Ld11,.Lzero_rest
170#endif
171 .quad .Le5,.Le_zero
172 .previous
173
174	/* Compute the 64-byte block offset for the main loop; accurate to 8
175	   bytes, with error on the pessimistic side. This is gross; it would
176	   be better to fix the interface. */
177 /* eax: zero, ebx: 64 */
178.Ls1e: addl $8,%eax
179.Ls2e: addl $8,%eax
180.Ls3e: addl $8,%eax
181.Ls4e: addl $8,%eax
182.Ls5e: addl $8,%eax
183.Ls6e: addl $8,%eax
184.Ls7e: addl $8,%eax
185.Ls8e: addl $8,%eax
186 addq %rbx,%rdi /* +64 */
187 subq %rax,%rdi /* correct destination with computed offset */
188
189 shlq $6,%rdx /* loop counter * 64 (stride length) */
190 addq %rax,%rdx /* add offset to loopcnt */
191 andl $63,%ecx /* remaining bytes */
192 addq %rcx,%rdx /* add them */
193 jmp .Lzero_rest
194
195 /* exception on quad word loop in tail handling */
196 /* ecx: loopcnt/8, %edx: length, rdi: correct */
197.Le_quad:
198 shll $3,%ecx
199 andl $7,%edx
200 addl %ecx,%edx
201 /* edx: bytes to zero, rdi: dest, eax:zero */
202.Lzero_rest:
203 cmpl $0,(%rsp) /* zero flag set? */
204 jz .Le_zero
205 movq %rdx,%rcx
206.Le_byte:
207 xorl %eax,%eax
208.Le5: rep
209 stosb
210 /* when there is another exception while zeroing the rest just return */
211.Le_zero:
212 movq %rdx,%rax
213 jmp .Lende
214 CFI_ENDPROC
215ENDPROC(__copy_user_nocache)
216
217
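The routine decomposes the copy into 64-byte unrolled blocks, then 8-byte quads, then single bytes, with movnti keeping the destination from displacing useful cache lines. A C model of just the size decomposition; memcpy stands in for the movq/movnti pairs, so this sketch is cache-polluting and fault-oblivious, unlike the real routine:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void copy_nocache_model(void *dst, const void *src, size_t len)
{
	char *d = dst;
	const char *s = src;

	while (len >= 64) {			/* .Lloop: 8 quads per pass */
		memcpy(d, s, 64);
		d += 64; s += 64; len -= 64;
	}
	while (len >= 8) {			/* .Lloop_8: one quad */
		memcpy(d, s, 8);
		d += 8; s += 8; len -= 8;
	}
	while (len--)				/* .Lloop_1: trailing bytes */
		*d++ = *s++;
}

int main(void)
{
	char src[100], dst[100];
	memset(src, 'x', sizeof(src));
	copy_nocache_model(dst, src, sizeof(src));
	printf("%d\n", memcmp(src, dst, sizeof(dst)));	/* prints 0 */
	return 0;
}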
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 49e8cf2e06f8..6ada7231f3ab 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb)
56} 56}
57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); 57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
58 58
59static inline int notify_page_fault(enum die_val val, const char *str, 59static inline int notify_page_fault(struct pt_regs *regs, long err)
60 struct pt_regs *regs, long err, int trap, int sig)
61{ 60{
62 struct die_args args = { 61 struct die_args args = {
63 .regs = regs, 62 .regs = regs,
64 .str = str, 63 .str = "page fault",
65 .err = err, 64 .err = err,
66 .trapnr = trap, 65 .trapnr = 14,
67 .signr = sig 66 .signr = SIGSEGV
68 }; 67 };
69 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); 68 return atomic_notifier_call_chain(&notify_page_fault_chain,
69 DIE_PAGE_FAULT, &args);
70} 70}
71 71
72/* Sometimes the CPU reports invalid exceptions on prefetch. 72/* Sometimes the CPU reports invalid exceptions on prefetch.
@@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
355 if (vmalloc_fault(address) >= 0) 355 if (vmalloc_fault(address) >= 0)
356 return; 356 return;
357 } 357 }
358 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, 358 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
359 SIGSEGV) == NOTIFY_STOP)
360 return; 359 return;
361 /* 360 /*
362 * Don't take the mm semaphore here. If we fixup a prefetch 361 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
365 goto bad_area_nosemaphore; 364 goto bad_area_nosemaphore;
366 } 365 }
367 366
368 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, 367 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
369 SIGSEGV) == NOTIFY_STOP)
370 return; 368 return;
371 369
372 if (likely(regs->eflags & X86_EFLAGS_IF)) 370 if (likely(regs->eflags & X86_EFLAGS_IF))
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 2ee2e003606c..41b8fb069924 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
37 37
38int numa_off __initdata; 38int numa_off __initdata;
39unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size;
39 41
40 42
41/* 43/*
@@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
52 int res = -1; 54 int res = -1;
53 unsigned long addr, end; 55 unsigned long addr, end;
54 56
55 if (shift >= 64) 57 memset(memnodemap, 0xff, memnodemapsize);
56 return -1;
57 memset(memnodemap, 0xff, sizeof(memnodemap));
58 for (i = 0; i < numnodes; i++) { 58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start; 59 addr = nodes[i].start;
60 end = nodes[i].end; 60 end = nodes[i].end;
61 if (addr >= end) 61 if (addr >= end)
62 continue; 62 continue;
63 if ((end >> shift) >= NODEMAPSIZE) 63 if ((end >> shift) >= memnodemapsize)
64 return 0; 64 return 0;
65 do { 65 do {
66 if (memnodemap[addr >> shift] != 0xff) 66 if (memnodemap[addr >> shift] != 0xff)
67 return -1; 67 return -1;
68 memnodemap[addr >> shift] = i; 68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift); 69 addr += (1UL << shift);
70 } while (addr < end); 70 } while (addr < end);
71 res = 1; 71 res = 1;
72 } 72 }
73 return res; 73 return res;
74} 74}
75 75
76int __init compute_hash_shift(struct bootnode *nodes, int numnodes) 76static int __init allocate_cachealigned_memnodemap(void)
77{ 77{
78 int shift = 20; 78 unsigned long pad, pad_addr;
79
80 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48)
82 return 0;
83
84 pad = L1_CACHE_BYTES - 1;
85 pad_addr = 0x8000;
86 nodemap_size = pad + memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
88 nodemap_size);
89 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0;
93 return -1;
94 }
95 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr);
97
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size);
100 return 0;
101}
79 102
80 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) 103/*
81 shift++; 104 * The lowest set bit across all the node start addresses gives
105 * the maximum possible shift.
106 */
107static int __init
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
109{
110 int i, nodes_used = 0;
111 unsigned long start, end;
112 unsigned long bitfield = 0, memtop = 0;
113
114 for (i = 0; i < numnodes; i++) {
115 start = nodes[i].start;
116 end = nodes[i].end;
117 if (start >= end)
118 continue;
119 bitfield |= start;
120 nodes_used++;
121 if (end > memtop)
122 memtop = end;
123 }
124 if (nodes_used <= 1)
125 i = 63;
126 else
127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
128 memnodemapsize = (memtop >> i)+1;
129 return i;
130}
131
132int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
133{
134 int shift;
82 135
136 shift = extract_lsb_from_nodes(nodes, numnodes);
137 if (allocate_cachealigned_memnodemap())
138 return -1;
83 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
84 shift); 140 shift);
85 141
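compute_hash_shift no longer probes shifts upward from 20; it takes the lowest set bit across the node start addresses as the shift (so every node boundary is a multiple of 1 << shift) and sizes memnodemap as (memtop >> shift) + 1 entries. A standalone sketch of the calculation, using the GCC/Clang builtin __builtin_ctzl in place of find_first_bit and assuming 64-bit longs:

#include <stdio.h>

struct bootnode { unsigned long start, end; };

static int extract_lsb_from_nodes(const struct bootnode *nodes, int n,
				  unsigned long *mapsize)
{
	unsigned long bitfield = 0, memtop = 0;
	int i, used = 0, shift;

	for (i = 0; i < n; i++) {
		if (nodes[i].start >= nodes[i].end)
			continue;
		bitfield |= nodes[i].start;
		used++;
		if (nodes[i].end > memtop)
			memtop = nodes[i].end;
	}
	/* 0 or 1 usable nodes: one giant map entry covers everything. */
	shift = (used <= 1) ? 63 : __builtin_ctzl(bitfield);
	*mapsize = (memtop >> shift) + 1;
	return shift;
}

int main(void)
{
	struct bootnode nodes[] = {
		{ 0x000000000, 0x080000000 },	/* node 0: 0-2GB */
		{ 0x080000000, 0x100000000 },	/* node 1: 2-4GB */
	};
	unsigned long mapsize;
	int shift = extract_lsb_from_nodes(nodes, 2, &mapsize);

	printf("shift=%d mapsize=%lu\n", shift, mapsize);	/* 31, 3 */
	return 0;
}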
@@ -216,31 +272,113 @@ void __init numa_init_array(void)
216} 272}
217 273
218#ifdef CONFIG_NUMA_EMU 274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */
219int numa_fake __initdata = 0; 276int numa_fake __initdata = 0;
220 277
221/* Numa emulation */ 278/*
279 * This function is used to find out if the start and end correspond to
280 * different zones.
281 */
282int zone_cross_over(unsigned long start, unsigned long end)
283{
284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
286 return 1;
287 return 0;
288}
289
222static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
223{ 291{
224 int i; 292 int i, big;
225 struct bootnode nodes[MAX_NUMNODES]; 293 struct bootnode nodes[MAX_NUMNODES];
226 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; 294 unsigned long sz, old_sz;
295 unsigned long hole_size;
296 unsigned long start, end;
297 unsigned long max_addr = (end_pfn << PAGE_SHIFT);
298
299 start = (start_pfn << PAGE_SHIFT);
300 hole_size = e820_hole_size(start, max_addr);
301 sz = (max_addr - start - hole_size) / numa_fake;
227 302
228 /* Kludge needed for the hash function */ 303 /* Kludge needed for the hash function */
229 if (hweight64(sz) > 1) {
230 unsigned long x = 1;
231 while ((x << 1) < sz)
232 x <<= 1;
233 if (x < sz/2)
234 printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
235 sz = x;
236 }
237 304
305 old_sz = sz;
306 /*
307 * Round down to the nearest FAKE_NODE_MIN_SIZE.
308 */
309 sz &= FAKE_NODE_MIN_HASH_MASK;
310
311 /*
312 * We ensure that each node is at least 64MB big. Anything smaller
313 * can cause VM hiccups.
314 */
315 if (sz == 0) {
316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
317 "the number of nodes\n", numa_fake);
318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
319 printk(KERN_INFO "Number of fake nodes will be = %d\n",
320 numa_fake);
321 sz = FAKE_NODE_MIN_SIZE;
322 }
323 /*
324 * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
325 * This logic ensures the extra memory gets distributed among as many
326 * nodes as possible (as compared to one single node getting all that
327 * extra memory).
328 */
329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
331 "%d\n",
332 (sz >> 20), (hole_size >> 20), big);
238 memset(&nodes,0,sizeof(nodes)); 333 memset(&nodes,0,sizeof(nodes));
334 end = start;
239 for (i = 0; i < numa_fake; i++) { 335 for (i = 0; i < numa_fake; i++) {
240 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; 336 /*
337 * In case we are not able to allocate enough memory for all
338 * the nodes, we reduce the number of fake nodes.
339 */
340 if (end >= max_addr) {
341 numa_fake = i - 1;
342 break;
343 }
344 start = nodes[i].start = end;
345 /*
346 * The final node gets all the remaining memory.
347 */
241 if (i == numa_fake-1) 348 if (i == numa_fake-1)
242 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; 349 sz = max_addr - start;
243 nodes[i].end = nodes[i].start + sz; 350 end = nodes[i].start + sz;
351 /*
352 * The first "big" nodes each get an extra granule.
353 */
354 if (i < big)
355 end += FAKE_NODE_MIN_SIZE;
356 /*
357 * Iterate over the range to ensure that this node gets at
358 * least sz amount of RAM (excluding holes)
359 */
360 while ((end - start - e820_hole_size(start, end)) < sz) {
361 end += FAKE_NODE_MIN_SIZE;
362 if (end >= max_addr)
363 break;
364 }
365 /*
366 * Look at the next node to make sure there is some real memory
367 * to map. Bad things happen when the only memory present
368 * in a zone on a fake node is an IO hole.
369 */
370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
371 if (zone_cross_over(start, end + sz)) {
372 end = (MAX_DMA32_PFN << PAGE_SHIFT);
373 break;
374 }
375 if (end >= max_addr)
376 break;
377 end += FAKE_NODE_MIN_SIZE;
378 }
379 if (end > max_addr)
380 end = max_addr;
381 nodes[i].end = end;
244 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
245 i, 383 i,
246 nodes[i].start, nodes[i].end, 384 nodes[i].start, nodes[i].end,
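The rewrite sizes each fake node as (usable memory)/numa_fake rounded down to the 64MB FAKE_NODE_MIN_SIZE granule, then hands the remainder out as one extra granule to the first "big" nodes. A simplified sketch of that distribution arithmetic, ignoring e820 holes and zone crossings; the totals are made-up:

#include <stdio.h>

#define FAKE_NODE_MIN_SIZE	(64UL << 20)	/* 64MB granule */

int main(void)
{
	unsigned long total = 1100UL << 20;	/* 1100MB of "RAM" */
	int nodes = 3, i, big;
	unsigned long sz, old_sz, start = 0;

	old_sz = total / nodes;				/* ~366MB  */
	sz = old_sz & ~(FAKE_NODE_MIN_SIZE - 1);	/* -> 320MB */

	/* How many nodes can absorb one extra 64MB granule? */
	big = ((old_sz - sz) * nodes) / FAKE_NODE_MIN_SIZE;
	printf("node size %luMB, %d big nodes\n", sz >> 20, big);

	for (i = 0; i < nodes; i++) {
		unsigned long end = start + sz;
		if (i < big)
			end += FAKE_NODE_MIN_SIZE;
		if (i == nodes - 1)
			end = total;	/* last node takes the rest */
		printf("node %d: %4luMB-%4luMB\n",
		       i, start >> 20, end >> 20);
		start = end;
	}
	return 0;
}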
@@ -290,6 +428,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
290 end_pfn << PAGE_SHIFT); 428 end_pfn << PAGE_SHIFT);
291 /* setup dummy node covering all memory */ 429 /* setup dummy node covering all memory */
292 memnode_shift = 63; 430 memnode_shift = 63;
431 memnodemap = memnode.embedded_map;
293 memnodemap[0] = 0; 432 memnodemap[0] = 0;
294 nodes_clear(node_online_map); 433 nodes_clear(node_online_map);
295 node_set_online(0); 434 node_set_online(0);
@@ -321,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void)
321 return pages; 460 return pages;
322} 461}
323 462
324#ifdef CONFIG_SPARSEMEM
325static void __init arch_sparse_init(void)
326{
327 int i;
328
329 for_each_online_node(i)
330 memory_present(i, node_start_pfn(i), node_end_pfn(i));
331
332 sparse_init();
333}
334#else
335#define arch_sparse_init() do {} while (0)
336#endif
337
338void __init paging_init(void) 463void __init paging_init(void)
339{ 464{
340 int i; 465 int i;
@@ -344,7 +469,8 @@ void __init paging_init(void)
344 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 469 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
345 max_zone_pfns[ZONE_NORMAL] = end_pfn; 470 max_zone_pfns[ZONE_NORMAL] = end_pfn;
346 471
347 arch_sparse_init(); 472 sparse_memory_present_with_active_regions(MAX_NUMNODES);
473 sparse_init();
348 474
349 for_each_online_node(i) { 475 for_each_online_node(i) {
350 setup_node_zones(i); 476 setup_node_zones(i);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index ccb91dd996a9..65c5eaa59905 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
107 pud_t *pud; 107 pud_t *pud;
108 pmd_t *pmd; 108 pmd_t *pmd;
109 pte_t large_pte; 109 pte_t large_pte;
110 unsigned long pfn;
110 111
111 pgd = pgd_offset_k(address); 112 pgd = pgd_offset_k(address);
112 BUG_ON(pgd_none(*pgd)); 113 BUG_ON(pgd_none(*pgd));
@@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
114 BUG_ON(pud_none(*pud)); 115 BUG_ON(pud_none(*pud));
115 pmd = pmd_offset(pud, address); 116 pmd = pmd_offset(pud, address);
116 BUG_ON(pmd_val(*pmd) & _PAGE_PSE); 117 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
117 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); 118 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
119 large_pte = pfn_pte(pfn, ref_prot);
118 large_pte = pte_mkhuge(large_pte); 120 large_pte = pte_mkhuge(large_pte);
119 set_pte((pte_t *)pmd, large_pte); 121 set_pte((pte_t *)pmd, large_pte);
120} 122}
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
index 149aba05a5b8..c9eddc8859c0 100644
--- a/arch/x86_64/pci/Makefile
+++ b/arch/x86_64/pci/Makefile
@@ -11,7 +11,7 @@ obj-y += fixup.o init.o
11obj-$(CONFIG_ACPI) += acpi.o 11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o early.o 12obj-y += legacy.o irq.o common.o early.o
13# mmconfig has a 64bit special 13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o 14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o
15 15
16obj-$(CONFIG_NUMA) += k8-bus.o 16obj-$(CONFIG_NUMA) += k8-bus.o
17 17
@@ -24,3 +24,4 @@ fixup-y += ../../i386/pci/fixup.o
24i386-y += ../../i386/pci/i386.o 24i386-y += ../../i386/pci/i386.o
25init-y += ../../i386/pci/init.o 25init-y += ../../i386/pci/init.o
26early-y += ../../i386/pci/early.o 26early-y += ../../i386/pci/early.o
27mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index faabb6e87f12..65d82736987e 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -13,16 +13,6 @@
13 13
14#include "pci.h" 14#include "pci.h"
15 15
16/* aperture is up to 256MB but BIOS may reserve less */
17#define MMCONFIG_APER_MIN (2 * 1024*1024)
18#define MMCONFIG_APER_MAX (256 * 1024*1024)
19
20/* Verify the first 16 busses. We assume that systems with more busses
21 get MCFG right. */
22#define MAX_CHECK_BUS 16
23
24static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
25
26/* Static virtual mapping of the MMCONFIG aperture */ 16/* Static virtual mapping of the MMCONFIG aperture */
27struct mmcfg_virt { 17struct mmcfg_virt {
28 struct acpi_mcfg_allocation *cfg; 18 struct acpi_mcfg_allocation *cfg;
@@ -32,30 +22,17 @@ static struct mmcfg_virt *pci_mmcfg_virt;
32 22
33static char __iomem *get_virt(unsigned int seg, unsigned bus) 23static char __iomem *get_virt(unsigned int seg, unsigned bus)
34{ 24{
35 int cfg_num = -1;
36 struct acpi_mcfg_allocation *cfg; 25 struct acpi_mcfg_allocation *cfg;
26 int cfg_num;
37 27
38 while (1) { 28 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
39 ++cfg_num;
40 if (cfg_num >= pci_mmcfg_config_num)
41 break;
42 cfg = pci_mmcfg_virt[cfg_num].cfg; 29 cfg = pci_mmcfg_virt[cfg_num].cfg;
43 if (cfg->pci_segment != seg) 30 if (cfg->pci_segment == seg &&
44 continue; 31 (cfg->start_bus_number <= bus) &&
45 if ((cfg->start_bus_number <= bus) &&
46 (cfg->end_bus_number >= bus)) 32 (cfg->end_bus_number >= bus))
47 return pci_mmcfg_virt[cfg_num].virt; 33 return pci_mmcfg_virt[cfg_num].virt;
48 } 34 }
49 35
50 /* Handle more broken MCFG tables on Asus etc.
51 They only contain a single entry for bus 0-0. Assume
52 this applies to all busses. */
53 cfg = &pci_mmcfg_config[0];
54 if (pci_mmcfg_config_num == 1 &&
55 cfg->pci_segment == 0 &&
56 (cfg->start_bus_number | cfg->end_bus_number) == 0)
57 return pci_mmcfg_virt[0].virt;
58
59 /* Fall back to type 0 */ 36 /* Fall back to type 0 */
60 return NULL; 37 return NULL;
61} 38}
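get_virt is now a plain linear scan: return the first MCFG allocation whose segment matches and whose start/end bus range contains the bus, with the old Asus single-entry workaround moved out of this path. A minimal sketch of the lookup; the struct loosely mirrors acpi_mcfg_allocation and the table contents are made up:

#include <stdio.h>

struct mcfg_alloc {
	unsigned short pci_segment;
	unsigned char start_bus_number, end_bus_number;
	void *virt;			/* mapped aperture */
};

static void *get_virt(const struct mcfg_alloc *cfg, int n,
		      unsigned int seg, unsigned int bus)
{
	int i;

	for (i = 0; i < n; i++)
		if (cfg[i].pci_segment == seg &&
		    cfg[i].start_bus_number <= bus &&
		    cfg[i].end_bus_number >= bus)
			return cfg[i].virt;
	return NULL;			/* caller falls back to type 1 */
}

int main(void)
{
	static char aperture[2];
	struct mcfg_alloc table[] = {
		{ 0,  0,  63, &aperture[0] },
		{ 0, 64, 255, &aperture[1] },
	};

	printf("bus 80 hits second entry: %d\n",
	       get_virt(table, 2, 0, 80) == &aperture[1]);
	return 0;
}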
@@ -63,8 +40,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
63static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 40static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
64{ 41{
65 char __iomem *addr; 42 char __iomem *addr;
66 if (seg == 0 && bus < MAX_CHECK_BUS && 43 if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
67 test_bit(32*bus + PCI_SLOT(devfn), fallback_slots)) 44 test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots))
68 return NULL; 45 return NULL;
69 addr = get_virt(seg, bus); 46 addr = get_virt(seg, bus);
70 if (!addr) 47 if (!addr)
@@ -135,79 +112,46 @@ static struct pci_raw_ops pci_mmcfg = {
135 .write = pci_mmcfg_write, 112 .write = pci_mmcfg_write,
136}; 113};
137 114
138/* K8 systems have some devices (typically in the builtin northbridge) 115static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
139 that are only accessible using type1
140 Normally this can be expressed in the MCFG by not listing them
141 and assigning suitable _SEGs, but this isn't implemented in some BIOS.
142 Instead try to discover all devices on bus 0 that are unreachable using MM
143 and fallback for them. */
144static __init void unreachable_devices(void)
145{ 116{
146 int i, k; 117 void __iomem *addr;
147 /* Use the max bus number from ACPI here? */ 118 u32 size;
148 for (k = 0; k < MAX_CHECK_BUS; k++) { 119
149 for (i = 0; i < 32; i++) { 120 size = (cfg->end_bus_number + 1) << 20;
150 u32 val1; 121 addr = ioremap_nocache(cfg->address, size);
151 char __iomem *addr; 122 if (addr) {
152 123 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
153 pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1); 124 cfg->address, cfg->address + size - 1);
154 if (val1 == 0xffffffff)
155 continue;
156 addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
157 if (addr == NULL|| readl(addr) != val1) {
158 set_bit(i + 32*k, fallback_slots);
159 printk(KERN_NOTICE "PCI: No mmconfig possible"
160 " on device %02x:%02x\n", k, i);
161 }
162 }
163 } 125 }
126 return addr;
164} 127}
165 128
166void __init pci_mmcfg_init(int type) 129int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
130 unsigned int devfn)
167{ 131{
168 int i; 132 return pci_dev_base(seg, bus, devfn) != NULL;
169 133}
170 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
171 return;
172
173 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
174 if ((pci_mmcfg_config_num == 0) ||
175 (pci_mmcfg_config == NULL) ||
176 (pci_mmcfg_config[0].address == 0))
177 return;
178
179 /* Only do this check when type 1 works. If it doesn't work
180 assume we run on a Mac and always use MCFG */
181 if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
182 pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
183 E820_RESERVED)) {
184 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
185 (unsigned long)pci_mmcfg_config[0].address);
186 printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
187 return;
188 }
189 134
190 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); 135int __init pci_mmcfg_arch_init(void)
136{
137 int i;
138 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) *
139 pci_mmcfg_config_num, GFP_KERNEL);
191 if (pci_mmcfg_virt == NULL) { 140 if (pci_mmcfg_virt == NULL) {
192 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); 141 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
193 return; 142 return 0;
194 } 143 }
144
195 for (i = 0; i < pci_mmcfg_config_num; ++i) { 145 for (i = 0; i < pci_mmcfg_config_num; ++i) {
196 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; 146 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
197 pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address, 147 pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
198 MMCONFIG_APER_MAX);
199 if (!pci_mmcfg_virt[i].virt) { 148 if (!pci_mmcfg_virt[i].virt) {
200 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " 149 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
201 "segment %d\n", 150 "segment %d\n",
202 pci_mmcfg_config[i].pci_segment); 151 pci_mmcfg_config[i].pci_segment);
203 return; 152 return 0;
204 } 153 }
205 printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n",
206 (unsigned long)pci_mmcfg_config[i].address);
207 } 154 }
208
209 unreachable_devices();
210
211 raw_pci_ops = &pci_mmcfg; 155 raw_pci_ops = &pci_mmcfg;
212 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 156 return 1;
213} 157}
diff --git a/drivers/acpi/namespace/nsinit.c b/drivers/acpi/namespace/nsinit.c
index 326af8fc0ce7..33db2241044e 100644
--- a/drivers/acpi/namespace/nsinit.c
+++ b/drivers/acpi/namespace/nsinit.c
@@ -45,6 +45,7 @@
45#include <acpi/acnamesp.h> 45#include <acpi/acnamesp.h>
46#include <acpi/acdispat.h> 46#include <acpi/acdispat.h>
47#include <acpi/acinterp.h> 47#include <acpi/acinterp.h>
48#include <linux/nmi.h>
48 49
49#define _COMPONENT ACPI_NAMESPACE 50#define _COMPONENT ACPI_NAMESPACE
50ACPI_MODULE_NAME("nsinit") 51ACPI_MODULE_NAME("nsinit")
@@ -534,7 +535,15 @@ acpi_ns_init_one_device(acpi_handle obj_handle,
534 info->parameter_type = ACPI_PARAM_ARGS; 535 info->parameter_type = ACPI_PARAM_ARGS;
535 info->flags = ACPI_IGNORE_RETURN_VALUE; 536 info->flags = ACPI_IGNORE_RETURN_VALUE;
536 537
538 /*
539 * Some hardware relies on this being executed as atomically
540 * as possible (without an NMI being received in the middle of
541 * this) - so disable NMIs and initialize the device:
542 */
543 acpi_nmi_disable();
537 status = acpi_ns_evaluate(info); 544 status = acpi_ns_evaluate(info);
545 acpi_nmi_enable();
546
538 if (ACPI_SUCCESS(status)) { 547 if (ACPI_SUCCESS(status)) {
539 walk_info->num_INI++; 548 walk_info->num_INI++;
540 549
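The _INI evaluation is bracketed by acpi_nmi_disable()/acpi_nmi_enable() so a watchdog NMI cannot land mid-initialization. A sketch of the same bracketing discipline; nmi_disable/nmi_enable and the device name are stand-ins for the ACPI helpers, not their real implementations:

#include <stdio.h>

/* Stand-ins for acpi_nmi_disable()/acpi_nmi_enable(). */
static void nmi_disable(void) { puts("NMI watchdog off"); }
static void nmi_enable(void)  { puts("NMI watchdog on"); }

static int evaluate_ini(const char *dev)
{
	printf("running _INI for %s\n", dev);
	return 0;
}

/* Keep the critical call bracketed; no early return between the pair. */
static int init_one_device(const char *dev)
{
	int status;

	nmi_disable();
	status = evaluate_ini(dev);
	nmi_enable();
	return status;
}

int main(void)
{
	return init_one_device("PCI0");
}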
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 1e640b899175..fd4e91734388 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1879,12 +1879,6 @@ again:
1879 1879
1880 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 1880 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
1881 1881
1882 /*
1883 * Profile KVM exit RIPs:
1884 */
1885 if (unlikely(prof_on == KVM_PROFILING))
1886 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
1887
1888 kvm_run->exit_type = 0; 1882 kvm_run->exit_type = 0;
1889 if (fail) { 1883 if (fail) {
1890 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; 1884 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
@@ -1907,6 +1901,12 @@ again:
1907 1901
1908 reload_tss(); 1902 reload_tss();
1909 } 1903 }
1904 /*
1905 * Profile KVM exit RIPs:
1906 */
1907 if (unlikely(prof_on == KVM_PROFILING))
1908 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
1909
1910 vcpu->launched = 1; 1910 vcpu->launched = 1;
1911 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; 1911 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
1912 r = kvm_handle_exit(kvm_run, vcpu); 1912 r = kvm_handle_exit(kvm_run, vcpu);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 669dbe5b0317..51db1182b27e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -76,7 +76,8 @@ static struct linux_binfmt elf_format = {
76 .load_binary = load_elf_binary, 76 .load_binary = load_elf_binary,
77 .load_shlib = load_elf_library, 77 .load_shlib = load_elf_library,
78 .core_dump = elf_core_dump, 78 .core_dump = elf_core_dump,
79 .min_coredump = ELF_EXEC_PAGESIZE 79 .min_coredump = ELF_EXEC_PAGESIZE,
80 .hasvdso = 1
80}; 81};
81 82
82#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) 83#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 9d774d07d95b..00c23433b39f 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -183,6 +183,19 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
183#endif 183#endif
184 184
185/* 185/*
186 * A facility to provide batching of the reload of page tables with the
187 * actual context switch code for paravirtualized guests. By convention,
188 * only one of the lazy modes (CPU, MMU) should be active at any given
189 * time, entries should never be nested, and entry and exit should
190 * always be paired. This keeps the kernel code sane to maintain and
191 * reason about.
192 */
193#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE
194#define arch_enter_lazy_cpu_mode() do {} while (0)
195#define arch_leave_lazy_cpu_mode() do {} while (0)
196#endif
197
198/*
186 * When walking page tables, get the address of the next boundary, 199 * When walking page tables, get the address of the next boundary,
187 * or the end address of the range if that comes earlier. Although no 200 * or the end address of the range if that comes earlier. Although no
188 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. 201 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
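The generic fallbacks compile to nothing; an architecture that defines __HAVE_ARCH_ENTER_LAZY_CPU_MODE gets its hooks called around the descriptor and segment reloads in context switch. A sketch of the pairing contract, with hypothetical hook bodies that track batching depth to assert no nesting:

#include <assert.h>
#include <stdio.h>

static int lazy_depth;	/* must only ever be 0 or 1 */

#define arch_enter_lazy_cpu_mode()	do {		\
	assert(lazy_depth++ == 0);			\
	puts("start batching CPU state updates");	\
} while (0)

#define arch_leave_lazy_cpu_mode()	do {		\
	assert(--lazy_depth == 0);			\
	puts("flush batched CPU state updates");	\
} while (0)

static void switch_to_next_task(void)
{
	arch_enter_lazy_cpu_mode();
	puts("  reload TLS, esp0, segment registers ...");
	arch_leave_lazy_cpu_mode();	/* always paired, never nested */
}

int main(void)
{
	switch_to_next_task();
	return 0;
}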
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h
index 41a44319905f..3a61206fd108 100644
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -43,6 +43,8 @@ extern void generic_apic_probe(void);
43#define apic_write native_apic_write 43#define apic_write native_apic_write
44#define apic_write_atomic native_apic_write_atomic 44#define apic_write_atomic native_apic_write_atomic
45#define apic_read native_apic_read 45#define apic_read native_apic_read
46#define setup_boot_clock setup_boot_APIC_clock
47#define setup_secondary_clock setup_secondary_APIC_clock
46#endif 48#endif
47 49
48static __inline fastcall void native_apic_write(unsigned long reg, 50static __inline fastcall void native_apic_write(unsigned long reg,
diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h
index 38f1aebbbdb5..c90c7c499302 100644
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -160,7 +160,7 @@ static void __init check_config(void)
160 * If we configured ourselves for a TSC, we'd better have one! 160 * If we configured ourselves for a TSC, we'd better have one!
161 */ 161 */
162#ifdef CONFIG_X86_TSC 162#ifdef CONFIG_X86_TSC
163 if (!cpu_has_tsc) 163 if (!cpu_has_tsc && !tsc_disable)
164 panic("Kernel compiled for Pentium+, requires TSC feature!"); 164 panic("Kernel compiled for Pentium+, requires TSC feature!");
165#endif 165#endif
166 166
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index f398cc456448..050831f34f71 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -22,7 +22,7 @@ struct Xgt_desc_struct {
22 22
23extern struct Xgt_desc_struct idt_descr; 23extern struct Xgt_desc_struct idt_descr;
24DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); 24DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
25 25extern struct Xgt_desc_struct early_gdt_descr;
26 26
27static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) 27static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
28{ 28{
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 369035dfe4b6..8d33c9bb7c1c 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -90,8 +90,8 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
90 pr_reg[6] = regs->eax; \ 90 pr_reg[6] = regs->eax; \
91 pr_reg[7] = regs->xds; \ 91 pr_reg[7] = regs->xds; \
92 pr_reg[8] = regs->xes; \ 92 pr_reg[8] = regs->xes; \
93 savesegment(fs,pr_reg[9]); \ 93 pr_reg[9] = regs->xfs; \
94 pr_reg[10] = regs->xgs; \ 94 savesegment(gs,pr_reg[10]); \
95 pr_reg[11] = regs->orig_eax; \ 95 pr_reg[11] = regs->orig_eax; \
96 pr_reg[12] = regs->eip; \ 96 pr_reg[12] = regs->eip; \
97 pr_reg[13] = regs->xcs; \ 97 pr_reg[13] = regs->xcs; \
diff --git a/include/asm-i386/idle.h b/include/asm-i386/idle.h
new file mode 100644
index 000000000000..87ab93911199
--- /dev/null
+++ b/include/asm-i386/idle.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_I386_IDLE_H
2#define _ASM_I386_IDLE_H 1
3
4#define IDLE_START 1
5#define IDLE_END 2
6
7struct notifier_block;
8void idle_notifier_register(struct notifier_block *n);
9void idle_notifier_unregister(struct notifier_block *n);
10
11void exit_idle(void);
12void enter_idle(void);
13
14#endif
diff --git a/include/asm-i386/mce.h b/include/asm-i386/mce.h
index 7cc1a973bf00..b0a02ee34ffd 100644
--- a/include/asm-i386/mce.h
+++ b/include/asm-i386/mce.h
@@ -3,3 +3,5 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
3#else 3#else
4#define mcheck_init(c) do {} while(0) 4#define mcheck_init(c) do {} while(0)
5#endif 5#endif
6
7extern int mce_disabled;
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 68ff102d6f5e..e6aa30f8de5b 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -63,7 +63,7 @@ static inline void switch_mm(struct mm_struct *prev,
63} 63}
64 64
65#define deactivate_mm(tsk, mm) \ 65#define deactivate_mm(tsk, mm) \
66 asm("movl %0,%%fs": :"r" (0)); 66 asm("movl %0,%%gs": :"r" (0));
67 67
68#define activate_mm(prev, next) \ 68#define activate_mm(prev, next) \
69 switch_mm((prev),(next),NULL) 69 switch_mm((prev),(next),NULL)
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 9f06265065f4..6317e0a4d735 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -59,90 +59,102 @@ struct paravirt_ops
59 convention. This makes it easier to implement inline 59 convention. This makes it easier to implement inline
60 assembler replacements. */ 60 assembler replacements. */
61 61
62 void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx, 62 void (*cpuid)(unsigned int *eax, unsigned int *ebx,
63 unsigned int *ecx, unsigned int *edx); 63 unsigned int *ecx, unsigned int *edx);
64 64
65 unsigned long (fastcall *get_debugreg)(int regno); 65 unsigned long (*get_debugreg)(int regno);
66 void (fastcall *set_debugreg)(int regno, unsigned long value); 66 void (*set_debugreg)(int regno, unsigned long value);
67 67
68 void (fastcall *clts)(void); 68 void (*clts)(void);
69 69
70 unsigned long (fastcall *read_cr0)(void); 70 unsigned long (*read_cr0)(void);
71 void (fastcall *write_cr0)(unsigned long); 71 void (*write_cr0)(unsigned long);
72 72
73 unsigned long (fastcall *read_cr2)(void); 73 unsigned long (*read_cr2)(void);
74 void (fastcall *write_cr2)(unsigned long); 74 void (*write_cr2)(unsigned long);
75 75
76 unsigned long (fastcall *read_cr3)(void); 76 unsigned long (*read_cr3)(void);
77 void (fastcall *write_cr3)(unsigned long); 77 void (*write_cr3)(unsigned long);
78 78
79 unsigned long (fastcall *read_cr4_safe)(void); 79 unsigned long (*read_cr4_safe)(void);
80 unsigned long (fastcall *read_cr4)(void); 80 unsigned long (*read_cr4)(void);
81 void (fastcall *write_cr4)(unsigned long); 81 void (*write_cr4)(unsigned long);
82 82
83 unsigned long (fastcall *save_fl)(void); 83 unsigned long (*save_fl)(void);
84 void (fastcall *restore_fl)(unsigned long); 84 void (*restore_fl)(unsigned long);
85 void (fastcall *irq_disable)(void); 85 void (*irq_disable)(void);
86 void (fastcall *irq_enable)(void); 86 void (*irq_enable)(void);
87 void (fastcall *safe_halt)(void); 87 void (*safe_halt)(void);
88 void (fastcall *halt)(void); 88 void (*halt)(void);
89 void (fastcall *wbinvd)(void); 89 void (*wbinvd)(void);
90 90
91 /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ 91 /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
92 u64 (fastcall *read_msr)(unsigned int msr, int *err); 92 u64 (*read_msr)(unsigned int msr, int *err);
93 int (fastcall *write_msr)(unsigned int msr, u64 val); 93 int (*write_msr)(unsigned int msr, u64 val);
94 94
95 u64 (fastcall *read_tsc)(void); 95 u64 (*read_tsc)(void);
96 u64 (fastcall *read_pmc)(void); 96 u64 (*read_pmc)(void);
97 97
98 void (fastcall *load_tr_desc)(void); 98 void (*load_tr_desc)(void);
99 void (fastcall *load_gdt)(const struct Xgt_desc_struct *); 99 void (*load_gdt)(const struct Xgt_desc_struct *);
100 void (fastcall *load_idt)(const struct Xgt_desc_struct *); 100 void (*load_idt)(const struct Xgt_desc_struct *);
101 void (fastcall *store_gdt)(struct Xgt_desc_struct *); 101 void (*store_gdt)(struct Xgt_desc_struct *);
102 void (fastcall *store_idt)(struct Xgt_desc_struct *); 102 void (*store_idt)(struct Xgt_desc_struct *);
103 void (fastcall *set_ldt)(const void *desc, unsigned entries); 103 void (*set_ldt)(const void *desc, unsigned entries);
104 unsigned long (fastcall *store_tr)(void); 104 unsigned long (*store_tr)(void);
105 void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu); 105 void (*load_tls)(struct thread_struct *t, unsigned int cpu);
106 void (fastcall *write_ldt_entry)(void *dt, int entrynum, 106 void (*write_ldt_entry)(void *dt, int entrynum,
107 u32 low, u32 high); 107 u32 low, u32 high);
108 void (fastcall *write_gdt_entry)(void *dt, int entrynum, 108 void (*write_gdt_entry)(void *dt, int entrynum,
109 u32 low, u32 high); 109 u32 low, u32 high);
110 void (fastcall *write_idt_entry)(void *dt, int entrynum, 110 void (*write_idt_entry)(void *dt, int entrynum,
111 u32 low, u32 high); 111 u32 low, u32 high);
112 void (fastcall *load_esp0)(struct tss_struct *tss, 112 void (*load_esp0)(struct tss_struct *tss,
113 struct thread_struct *thread); 113 struct thread_struct *thread);
114 114
115 void (fastcall *set_iopl_mask)(unsigned mask); 115 void (*set_iopl_mask)(unsigned mask);
116 116
117 void (fastcall *io_delay)(void); 117 void (*io_delay)(void);
118 void (*const_udelay)(unsigned long loops); 118 void (*const_udelay)(unsigned long loops);
119 119
120#ifdef CONFIG_X86_LOCAL_APIC 120#ifdef CONFIG_X86_LOCAL_APIC
121 void (fastcall *apic_write)(unsigned long reg, unsigned long v); 121 void (*apic_write)(unsigned long reg, unsigned long v);
122 void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v); 122 void (*apic_write_atomic)(unsigned long reg, unsigned long v);
123 unsigned long (fastcall *apic_read)(unsigned long reg); 123 unsigned long (*apic_read)(unsigned long reg);
124 void (*setup_boot_clock)(void);
125 void (*setup_secondary_clock)(void);
124#endif 126#endif
125 127
126 void (fastcall *flush_tlb_user)(void); 128 void (*flush_tlb_user)(void);
127 void (fastcall *flush_tlb_kernel)(void); 129 void (*flush_tlb_kernel)(void);
128 void (fastcall *flush_tlb_single)(u32 addr); 130 void (*flush_tlb_single)(u32 addr);
129 131
130 void (fastcall *set_pte)(pte_t *ptep, pte_t pteval); 132 void (*alloc_pt)(u32 pfn);
131 void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); 133 void (*alloc_pd)(u32 pfn);
132 void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval); 134 void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
133 void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep); 135 void (*release_pt)(u32 pfn);
134 void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep); 136 void (*release_pd)(u32 pfn);
137
138 void (*set_pte)(pte_t *ptep, pte_t pteval);
139 void (*set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
140 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
141 void (*pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep);
142 void (*pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep);
135#ifdef CONFIG_X86_PAE 143#ifdef CONFIG_X86_PAE
136 void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval); 144 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
137 void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); 145 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
138 void (fastcall *set_pud)(pud_t *pudp, pud_t pudval); 146 void (*set_pud)(pud_t *pudp, pud_t pudval);
139 void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 147 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
140 void (fastcall *pmd_clear)(pmd_t *pmdp); 148 void (*pmd_clear)(pmd_t *pmdp);
141#endif 149#endif
142 150
151 void (*set_lazy_mode)(int mode);
152
143 /* These two are jmp to, not actually called. */ 153 /* These two are jmp to, not actually called. */
144 void (fastcall *irq_enable_sysexit)(void); 154 void (*irq_enable_sysexit)(void);
145 void (fastcall *iret)(void); 155 void (*iret)(void);
156
157 void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp);
146}; 158};
147 159
148/* Mark a paravirt probe function. */ 160/* Mark a paravirt probe function. */
@@ -313,13 +325,38 @@ static inline unsigned long apic_read(unsigned long reg)
313{ 325{
314 return paravirt_ops.apic_read(reg); 326 return paravirt_ops.apic_read(reg);
315} 327}
328
329static inline void setup_boot_clock(void)
330{
331 paravirt_ops.setup_boot_clock();
332}
333
334static inline void setup_secondary_clock(void)
335{
336 paravirt_ops.setup_secondary_clock();
337}
316#endif 338#endif
317 339
340#ifdef CONFIG_SMP
341static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
342 unsigned long start_esp)
343{
344 return paravirt_ops.startup_ipi_hook(phys_apicid, start_eip, start_esp);
345}
346#endif
318 347
319#define __flush_tlb() paravirt_ops.flush_tlb_user() 348#define __flush_tlb() paravirt_ops.flush_tlb_user()
320#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() 349#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
321#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr) 350#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
322 351
352#define paravirt_alloc_pt(pfn) paravirt_ops.alloc_pt(pfn)
353#define paravirt_release_pt(pfn) paravirt_ops.release_pt(pfn)
354
355#define paravirt_alloc_pd(pfn) paravirt_ops.alloc_pd(pfn)
356#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) \
357 paravirt_ops.alloc_pd_clone(pfn, clonepfn, start, count)
358#define paravirt_release_pd(pfn) paravirt_ops.release_pd(pfn)
359
323static inline void set_pte(pte_t *ptep, pte_t pteval) 360static inline void set_pte(pte_t *ptep, pte_t pteval)
324{ 361{
325 paravirt_ops.set_pte(ptep, pteval); 362 paravirt_ops.set_pte(ptep, pteval);
@@ -372,6 +409,19 @@ static inline void pmd_clear(pmd_t *pmdp)
372} 409}
373#endif 410#endif
374 411
412/* Lazy mode for batching updates / context switch */
413#define PARAVIRT_LAZY_NONE 0
414#define PARAVIRT_LAZY_MMU 1
415#define PARAVIRT_LAZY_CPU 2
416
417#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
418#define arch_enter_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_CPU)
419#define arch_leave_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE)
420
421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
422#define arch_enter_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_MMU)
423#define arch_leave_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE)
424
375/* These all sit in the .parainstructions section to tell us what to patch. */ 425/* These all sit in the .parainstructions section to tell us what to patch. */
376struct paravirt_patch { 426struct paravirt_patch {
377 u8 *instr; /* original instructions */ 427 u8 *instr; /* original instructions */
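Every one of these wrappers is an indirect call through a single global ops table, so a hypervisor port only has to fill in the table at boot. A toy model of the pattern, with a two-member ops struct in place of the full paravirt_ops:

#include <stdio.h>

struct pv_ops {
	void (*irq_disable)(void);
	void (*irq_enable)(void);
};

static void native_irq_disable(void) { puts("cli"); }
static void native_irq_enable(void)  { puts("sti"); }

/* A guest backend replaces these with hypercalls at boot. */
static struct pv_ops paravirt_ops = {
	.irq_disable = native_irq_disable,
	.irq_enable  = native_irq_enable,
};

static inline void raw_local_irq_disable(void)
{
	paravirt_ops.irq_disable();
}

static inline void raw_local_irq_enable(void)
{
	paravirt_ops.irq_enable();
}

int main(void)
{
	raw_local_irq_disable();
	raw_local_irq_enable();
	return 0;
}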
diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h
index 2ba2736aa109..b12d59a318b7 100644
--- a/include/asm-i386/pda.h
+++ b/include/asm-i386/pda.h
@@ -39,19 +39,19 @@ extern struct i386_pda _proxy_pda;
39 if (0) { T__ tmp__; tmp__ = (val); } \ 39 if (0) { T__ tmp__; tmp__ = (val); } \
40 switch (sizeof(_proxy_pda.field)) { \ 40 switch (sizeof(_proxy_pda.field)) { \
41 case 1: \ 41 case 1: \
42 asm(op "b %1,%%gs:%c2" \ 42 asm(op "b %1,%%fs:%c2" \
43 : "+m" (_proxy_pda.field) \ 43 : "+m" (_proxy_pda.field) \
44 :"ri" ((T__)val), \ 44 :"ri" ((T__)val), \
45 "i"(pda_offset(field))); \ 45 "i"(pda_offset(field))); \
46 break; \ 46 break; \
47 case 2: \ 47 case 2: \
48 asm(op "w %1,%%gs:%c2" \ 48 asm(op "w %1,%%fs:%c2" \
49 : "+m" (_proxy_pda.field) \ 49 : "+m" (_proxy_pda.field) \
50 :"ri" ((T__)val), \ 50 :"ri" ((T__)val), \
51 "i"(pda_offset(field))); \ 51 "i"(pda_offset(field))); \
52 break; \ 52 break; \
53 case 4: \ 53 case 4: \
54 asm(op "l %1,%%gs:%c2" \ 54 asm(op "l %1,%%fs:%c2" \
55 : "+m" (_proxy_pda.field) \ 55 : "+m" (_proxy_pda.field) \
56 :"ri" ((T__)val), \ 56 :"ri" ((T__)val), \
57 "i"(pda_offset(field))); \ 57 "i"(pda_offset(field))); \
@@ -65,19 +65,19 @@ extern struct i386_pda _proxy_pda;
65 typeof(_proxy_pda.field) ret__; \ 65 typeof(_proxy_pda.field) ret__; \
66 switch (sizeof(_proxy_pda.field)) { \ 66 switch (sizeof(_proxy_pda.field)) { \
67 case 1: \ 67 case 1: \
68 asm(op "b %%gs:%c1,%0" \ 68 asm(op "b %%fs:%c1,%0" \
69 : "=r" (ret__) \ 69 : "=r" (ret__) \
70 : "i" (pda_offset(field)), \ 70 : "i" (pda_offset(field)), \
71 "m" (_proxy_pda.field)); \ 71 "m" (_proxy_pda.field)); \
72 break; \ 72 break; \
73 case 2: \ 73 case 2: \
74 asm(op "w %%gs:%c1,%0" \ 74 asm(op "w %%fs:%c1,%0" \
75 : "=r" (ret__) \ 75 : "=r" (ret__) \
76 : "i" (pda_offset(field)), \ 76 : "i" (pda_offset(field)), \
77 "m" (_proxy_pda.field)); \ 77 "m" (_proxy_pda.field)); \
78 break; \ 78 break; \
79 case 4: \ 79 case 4: \
80 asm(op "l %%gs:%c1,%0" \ 80 asm(op "l %%fs:%c1,%0" \
81 : "=r" (ret__) \ 81 : "=r" (ret__) \
82 : "i" (pda_offset(field)), \ 82 : "i" (pda_offset(field)), \
83 "m" (_proxy_pda.field)); \ 83 "m" (_proxy_pda.field)); \
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
index 4b1e61359f89..c8dc2d0141a7 100644
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -5,13 +5,31 @@
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/mm.h> /* for struct page */ 6#include <linux/mm.h> /* for struct page */
7 7
8#define pmd_populate_kernel(mm, pmd, pte) \ 8#ifdef CONFIG_PARAVIRT
9 set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) 9#include <asm/paravirt.h>
10#else
11#define paravirt_alloc_pt(pfn) do { } while (0)
12#define paravirt_alloc_pd(pfn) do { } while (0)
14#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
15#define paravirt_release_pt(pfn) do { } while (0)
16#define paravirt_release_pd(pfn) do { } while (0)
17#endif
18
19#define pmd_populate_kernel(mm, pmd, pte) \
20do { \
21 paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT); \
22 set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
23} while (0)
10 24
11#define pmd_populate(mm, pmd, pte) \ 25#define pmd_populate(mm, pmd, pte) \
26do { \
27 paravirt_alloc_pt(page_to_pfn(pte)); \
12 set_pmd(pmd, __pmd(_PAGE_TABLE + \ 28 set_pmd(pmd, __pmd(_PAGE_TABLE + \
13 ((unsigned long long)page_to_pfn(pte) << \ 29 ((unsigned long long)page_to_pfn(pte) << \
14 (unsigned long long) PAGE_SHIFT))) 30 (unsigned long long) PAGE_SHIFT))); \
31} while (0)
32
15/* 33/*
16 * Allocate and free page tables. 34 * Allocate and free page tables.
17 */ 35 */
@@ -32,7 +50,11 @@ static inline void pte_free(struct page *pte)
32} 50}
33 51
34 52
35#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) 53#define __pte_free_tlb(tlb,pte) \
54do { \
55 paravirt_release_pt(page_to_pfn(pte)); \
56 tlb_remove_page((tlb),(pte)); \
57} while (0)
36 58
37#ifdef CONFIG_X86_PAE 59#ifdef CONFIG_X86_PAE
38/* 60/*
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 359f10b54f59..edfbe46a5e13 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -257,6 +257,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
257 : :"a" (eax), "c" (ecx)); 257 : :"a" (eax), "c" (ecx));
258} 258}
259 259
260static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
261{
262 /* "sti; mwait %eax,%ecx;" */
263 asm volatile(
264 "sti; .byte 0x0f,0x01,0xc9;"
265 : :"a" (eax), "c" (ecx));
266}
267
260extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); 268extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
261 269
262/* from system description table in BIOS. Mostly for MCA use, but 270/* from system description table in BIOS. Mostly for MCA use, but
@@ -424,7 +432,7 @@ struct thread_struct {
424 .vm86_info = NULL, \ 432 .vm86_info = NULL, \
425 .sysenter_cs = __KERNEL_CS, \ 433 .sysenter_cs = __KERNEL_CS, \
426 .io_bitmap_ptr = NULL, \ 434 .io_bitmap_ptr = NULL, \
427 .gs = __KERNEL_PDA, \ 435 .fs = __KERNEL_PDA, \
428} 436}
429 437
430/* 438/*
@@ -442,8 +450,8 @@ struct thread_struct {
442} 450}
443 451
444#define start_thread(regs, new_eip, new_esp) do { \ 452#define start_thread(regs, new_eip, new_esp) do { \
445 __asm__("movl %0,%%fs": :"r" (0)); \ 453 __asm__("movl %0,%%gs": :"r" (0)); \
446 regs->xgs = 0; \ 454 regs->xfs = 0; \
447 set_fs(USER_DS); \ 455 set_fs(USER_DS); \
448 regs->xds = __USER_DS; \ 456 regs->xds = __USER_DS; \
449 regs->xes = __USER_DS; \ 457 regs->xes = __USER_DS; \
diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index bdbc894339b4..6002597b9e12 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -16,8 +16,8 @@ struct pt_regs {
16 long eax; 16 long eax;
17 int xds; 17 int xds;
18 int xes; 18 int xes;
19 /* int xfs; */ 19 int xfs;
20 int xgs; 20 /* int xgs; */
21 long orig_eax; 21 long orig_eax;
22 long eip; 22 long eip;
23 int xcs; 23 int xcs;
@@ -49,6 +49,10 @@ static inline int user_mode_vm(struct pt_regs *regs)
49{ 49{
50 return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL; 50 return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL;
51} 51}
52static inline int v8086_mode(struct pt_regs *regs)
53{
54 return (regs->eflags & VM_MASK);
55}
52 56
53#define instruction_pointer(regs) ((regs)->eip) 57#define instruction_pointer(regs) ((regs)->eip)
54#define regs_return_value(regs) ((regs)->eax) 58#define regs_return_value(regs) ((regs)->eax)
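v8086_mode() packages the eflags VM-bit test that trap handlers previously open-coded. A sketch of the pattern as it appears in a handler like do_general_protection:

static void gp_sketch(struct pt_regs *regs, long error_code)
{
        if (v8086_mode(regs)) {         /* was: regs->eflags & VM_MASK */
                local_irq_enable();
                handle_vm86_fault((struct kernel_vm86_regs *)regs,
                                  error_code);
                return;
        }
        /* ... normal protected-mode handling ... */
}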
diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h
index 3c796af33776..065f10bfa487 100644
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -83,14 +83,8 @@
83 * The GDT has 32 entries 83 * The GDT has 32 entries
84 */ 84 */
85#define GDT_ENTRIES 32 85#define GDT_ENTRIES 32
86
87#define GDT_SIZE (GDT_ENTRIES * 8) 86#define GDT_SIZE (GDT_ENTRIES * 8)
88 87
89/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
90#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
91/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
92#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
93
94/* Simple and small GDT entries for booting only */ 88/* Simple and small GDT entries for booting only */
95 89
96#define GDT_ENTRY_BOOT_CS 2 90#define GDT_ENTRY_BOOT_CS 2
@@ -134,4 +128,17 @@
134#ifndef CONFIG_PARAVIRT 128#ifndef CONFIG_PARAVIRT
135#define get_kernel_rpl() 0 129#define get_kernel_rpl() 0
136#endif 130#endif
131/*
132 * Matching rules for certain types of segments.
133 */
134
135/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
136#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8)
137
138/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
139#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
140
141/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
142#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
143
137#endif 144#endif
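The masks encode the GDT layout. Assuming GDT_ENTRY_KERNEL_CS is 12 and the default user code entry is 14, so __KERNEL_CS == 0x60 and __USER_CS == 0x73, the arithmetic works out as follows:

/* 0x60 & 0xfc == 0x60  -> SEGMENT_IS_KERNEL_CODE(__KERNEL_CS) is true
 * 0x73 & 0xfc == 0x70  -> user CS does not match (only RPL is masked)
 *
 * 0x60 & 0xec == 0x60  -> SEGMENT_IS_FLAT_CODE(__KERNEL_CS) is true
 * 0x73 & 0xec == 0x60  -> user CS matches too: 0xec also masks bit 4,
 *                         the bit separating GDT entries 12 and 14
 *
 * Bit 2 (the TI bit) survives every mask, so LDT selectors never match.
 */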
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 76316275d6f9..0e8077cbfdac 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -77,6 +77,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map);
77void __init add_memory_region(unsigned long long start, 77void __init add_memory_region(unsigned long long start,
78 unsigned long long size, int type); 78 unsigned long long size, int type);
79 79
80extern unsigned long init_pg_tables_end;
81
80#endif /* __ASSEMBLY__ */ 82#endif /* __ASSEMBLY__ */
81 83
82#endif /* __KERNEL__ */ 84#endif /* __KERNEL__ */
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 64fe624c02ca..6bf0033a301c 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -52,6 +52,11 @@ extern void cpu_exit_clear(void);
52extern void cpu_uninit(void); 52extern void cpu_uninit(void);
53#endif 53#endif
54 54
55#ifndef CONFIG_PARAVIRT
56#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
57do { } while (0)
58#endif
59
55/* 60/*
56 * This function is needed by all SMP systems. It must _always_ be valid 61 * This function is needed by all SMP systems. It must _always_ be valid
57 * from the initial startup. We map APIC_BASE very early in page_setup(), 62 * from the initial startup. We map APIC_BASE very early in page_setup(),
diff --git a/include/asm-i386/time.h b/include/asm-i386/time.h
index ea8065af825a..571b4294dc2e 100644
--- a/include/asm-i386/time.h
+++ b/include/asm-i386/time.h
@@ -30,6 +30,7 @@ static inline int native_set_wallclock(unsigned long nowtime)
30 30
31#ifdef CONFIG_PARAVIRT 31#ifdef CONFIG_PARAVIRT
32#include <asm/paravirt.h> 32#include <asm/paravirt.h>
33extern unsigned long long native_sched_clock(void);
33#else /* !CONFIG_PARAVIRT */ 34#else /* !CONFIG_PARAVIRT */
34 35
35#define get_wallclock() native_get_wallclock() 36#define get_wallclock() native_get_wallclock()
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index d0ebd05f8516..4752c3a6a708 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -8,6 +8,9 @@ void setup_pit_timer(void);
8/* Modifiers for buggy PIT handling */ 8/* Modifiers for buggy PIT handling */
9extern int pit_latch_buggy; 9extern int pit_latch_buggy;
10extern int timer_ack; 10extern int timer_ack;
11extern int no_timer_check;
12extern unsigned long long (*custom_sched_clock)(void);
13extern int no_sync_cmos_clock;
11extern int recalibrate_cpu_khz(void); 14extern int recalibrate_cpu_khz(void);
12 15
13#endif 16#endif
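custom_sched_clock lets an alternate clock source take over sched_clock(): the TSC code checks the pointer before falling back to its native path, and the VMI time code in this series installs its own clock this way. A sketch of such a hook-up, with hypothetical myclk_* names:

static unsigned long long myclk_sched_clock(void)       /* hypothetical */
{
        return read_myclk_cycles();     /* hypothetical cycle counter read */
}

void __init myclk_time_init(void)
{
        custom_sched_clock = myclk_sched_clock;
}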
diff --git a/include/asm-i386/vmi.h b/include/asm-i386/vmi.h
new file mode 100644
index 000000000000..43c89333037e
--- /dev/null
+++ b/include/asm-i386/vmi.h
@@ -0,0 +1,262 @@
1/*
2 * VMI interface definition
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Maintained by: Zachary Amsden zach@vmware.com
22 *
23 */
24#include <linux/types.h>
25
26/*
27 *---------------------------------------------------------------------
28 *
29 * VMI Option ROM API
30 *
31 *---------------------------------------------------------------------
32 */
33#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
34
35#define PCI_VENDOR_ID_VMWARE 0x15AD
36#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
37
38/*
39 * We use two version numbers for compatibility, with the major
40 * number signifying interface breakages, and the minor number
41 * interface extensions.
42 */
43#define VMI_API_REV_MAJOR 3
44#define VMI_API_REV_MINOR 0
45
46#define VMI_CALL_CPUID 0
47#define VMI_CALL_WRMSR 1
48#define VMI_CALL_RDMSR 2
49#define VMI_CALL_SetGDT 3
50#define VMI_CALL_SetLDT 4
51#define VMI_CALL_SetIDT 5
52#define VMI_CALL_SetTR 6
53#define VMI_CALL_GetGDT 7
54#define VMI_CALL_GetLDT 8
55#define VMI_CALL_GetIDT 9
56#define VMI_CALL_GetTR 10
57#define VMI_CALL_WriteGDTEntry 11
58#define VMI_CALL_WriteLDTEntry 12
59#define VMI_CALL_WriteIDTEntry 13
60#define VMI_CALL_UpdateKernelStack 14
61#define VMI_CALL_SetCR0 15
62#define VMI_CALL_SetCR2 16
63#define VMI_CALL_SetCR3 17
64#define VMI_CALL_SetCR4 18
65#define VMI_CALL_GetCR0 19
66#define VMI_CALL_GetCR2 20
67#define VMI_CALL_GetCR3 21
68#define VMI_CALL_GetCR4 22
69#define VMI_CALL_WBINVD 23
70#define VMI_CALL_SetDR 24
71#define VMI_CALL_GetDR 25
72#define VMI_CALL_RDPMC 26
73#define VMI_CALL_RDTSC 27
74#define VMI_CALL_CLTS 28
75#define VMI_CALL_EnableInterrupts 29
76#define VMI_CALL_DisableInterrupts 30
77#define VMI_CALL_GetInterruptMask 31
78#define VMI_CALL_SetInterruptMask 32
79#define VMI_CALL_IRET 33
80#define VMI_CALL_SYSEXIT 34
81#define VMI_CALL_Halt 35
82#define VMI_CALL_Reboot 36
83#define VMI_CALL_Shutdown 37
84#define VMI_CALL_SetPxE 38
85#define VMI_CALL_SetPxELong 39
86#define VMI_CALL_UpdatePxE 40
87#define VMI_CALL_UpdatePxELong 41
88#define VMI_CALL_MachineToPhysical 42
89#define VMI_CALL_PhysicalToMachine 43
90#define VMI_CALL_AllocatePage 44
91#define VMI_CALL_ReleasePage 45
92#define VMI_CALL_InvalPage 46
93#define VMI_CALL_FlushTLB 47
94#define VMI_CALL_SetLinearMapping 48
95
96#define VMI_CALL_SetIOPLMask 61
97#define VMI_CALL_SetInitialAPState 62
98#define VMI_CALL_APICWrite 63
99#define VMI_CALL_APICRead 64
100#define VMI_CALL_SetLazyMode 73
101
102/*
103 *---------------------------------------------------------------------
104 *
105 * MMU operation flags
106 *
107 *---------------------------------------------------------------------
108 */
109
110/* Flags used by VMI_{Allocate|Release}Page call */
111#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
112#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
113#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
114
115
116/* Flags shared by Allocate|Release Page and PTE updates */
117#define VMI_PAGE_PT 0x01
118#define VMI_PAGE_PD 0x02
119#define VMI_PAGE_PDP 0x04
120#define VMI_PAGE_PML4 0x08
121
122#define VMI_PAGE_NORMAL 0x00 /* for debugging */
123
124/* Flags used by PTE updates */
125#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
126#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
127#define VMI_PAGE_VA_MASK 0xfffff000
128
129#ifdef CONFIG_X86_PAE
130#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
131#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
132#else
133#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
134#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
135#endif
136
137/* Flags used by VMI_FlushTLB call */
138#define VMI_FLUSH_TLB 0x01
139#define VMI_FLUSH_GLOBAL 0x02
140
141/*
142 *---------------------------------------------------------------------
143 *
144 * VMI relocation definitions for ROM call get_reloc
145 *
146 *---------------------------------------------------------------------
147 */
148
149/* VMI Relocation types */
150#define VMI_RELOCATION_NONE 0
151#define VMI_RELOCATION_CALL_REL 1
152#define VMI_RELOCATION_JUMP_REL 2
153#define VMI_RELOCATION_NOP 3
154
155#ifndef __ASSEMBLY__
156struct vmi_relocation_info {
157 unsigned char *eip;
158 unsigned char type;
159 unsigned char reserved[3];
160};
161#endif
162
163
164/*
165 *---------------------------------------------------------------------
166 *
167 * Generic ROM structures and definitions
168 *
169 *---------------------------------------------------------------------
170 */
171
172#ifndef __ASSEMBLY__
173
174struct vrom_header {
175 u16 rom_signature; // option ROM signature
176 u8 rom_length; // ROM length in 512 byte chunks
177 u8 rom_entry[4]; // 16-bit code entry point
178 u8 rom_pad0; // 4-byte align pad
179 u32 vrom_signature; // VROM identification signature
180 u8 api_version_min;// Minor version of API
181 u8 api_version_maj;// Major version of API
182 u8 jump_slots; // Number of jump slots
183 u8 reserved1; // Reserved for expansion
184 u32 virtual_top; // Hypervisor virtual address start
185 u16 reserved2; // Reserved for expansion
186 u16 license_offs; // Offset to License string
187 u16 pci_header_offs;// Offset to PCI OPROM header
188 u16 pnp_header_offs;// Offset to PnP OPROM header
189 u32 rom_pad3; // PnP reserved / VMI reserved
190 u8 reserved[96]; // Reserved for headers
191 char vmi_init[8]; // VMI_Init jump point
192 char get_reloc[8]; // VMI_GetRelocationInfo jump point
193} __attribute__((packed));
194
195struct pnp_header {
196 char sig[4];
197 char rev;
198 char size;
199 short next;
200 short res;
201 long devID;
202 unsigned short manufacturer_offset;
203 unsigned short product_offset;
204} __attribute__((packed));
205
206struct pci_header {
207 char sig[4];
208 short vendorID;
209 short deviceID;
210 short vpdData;
211 short size;
212 char rev;
213 char class;
214 char subclass;
215 char interface;
216 short chunks;
217 char rom_version_min;
218 char rom_version_maj;
219 char codetype;
220 char lastRom;
221 short reserved;
222} __attribute__((packed));
223
224/* Function prototypes for bootstrapping */
225extern void vmi_init(void);
226extern void vmi_bringup(void);
227extern void vmi_apply_boot_page_allocations(void);
228
229/* State needed to start an application processor in an SMP system. */
230struct vmi_ap_state {
231 u32 cr0;
232 u32 cr2;
233 u32 cr3;
234 u32 cr4;
235
236 u64 efer;
237
238 u32 eip;
239 u32 eflags;
240 u32 eax;
241 u32 ebx;
242 u32 ecx;
243 u32 edx;
244 u32 esp;
245 u32 ebp;
246 u32 esi;
247 u32 edi;
248 u16 cs;
249 u16 ss;
250 u16 ds;
251 u16 es;
252 u16 fs;
253 u16 gs;
254 u16 ldtr;
255
256 u16 gdtr_limit;
257 u32 gdtr_base;
258 u32 idtr_base;
259 u16 idtr_limit;
260};
261
262#endif
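Each VMI_CALL_* index can be bound three ways at boot: patched into a direct relative call, a relative jump, or nop'ed out entirely. A sketch of consuming struct vmi_relocation_info, where vmi_get_relocation_info() is a hypothetical wrapper around the get_reloc ROM jump slot:

static unsigned char *vmi_call_target(int vmi_call)
{
        struct vmi_relocation_info reloc;

        vmi_get_relocation_info(vmi_call, &reloc);      /* hypothetical */
        switch (reloc.type) {
        case VMI_RELOCATION_CALL_REL:
        case VMI_RELOCATION_JUMP_REL:
                return reloc.eip;       /* patch the call site to here */
        case VMI_RELOCATION_NOP:
        case VMI_RELOCATION_NONE:
        default:
                return NULL;            /* nop out / leave native code */
        }
}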
diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h
new file mode 100644
index 000000000000..c12931211007
--- /dev/null
+++ b/include/asm-i386/vmi_time.h
@@ -0,0 +1,103 @@
1/*
2 * VMI Time wrappers
3 *
4 * Copyright (C) 2006, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25#ifndef __VMI_TIME_H
26#define __VMI_TIME_H
27
28/*
29 * Raw VMI call indices for timer functions
30 */
31#define VMI_CALL_GetCycleFrequency 66
32#define VMI_CALL_GetCycleCounter 67
33#define VMI_CALL_SetAlarm 68
34#define VMI_CALL_CancelAlarm 69
35#define VMI_CALL_GetWallclockTime 70
36#define VMI_CALL_WallclockUpdated 71
37
38/* Cached VMI timer operations */
39extern struct vmi_timer_ops {
40 u64 (*get_cycle_frequency)(void);
41 u64 (*get_cycle_counter)(int);
42 u64 (*get_wallclock)(void);
43 int (*wallclock_updated)(void);
44 void (*set_alarm)(u32 flags, u64 expiry, u64 period);
45 void (*cancel_alarm)(u32 flags);
46} vmi_timer_ops;
47
48/* Prototypes */
49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_sched_clock(void);
53
54#ifdef CONFIG_X86_LOCAL_APIC
55extern void __init vmi_timer_setup_boot_alarm(void);
56extern void __init vmi_timer_setup_secondary_alarm(void);
57extern void apic_vmi_timer_interrupt(void);
58#endif
59
60#ifdef CONFIG_NO_IDLE_HZ
61extern int vmi_stop_hz_timer(void);
62extern void vmi_account_time_restart_hz_timer(void);
63#endif
64
65/*
66 * When run under a hypervisor, a vcpu is always in one of three states:
67 * running, halted, or ready. The vcpu is in the 'running' state if it
68 * is executing. When the vcpu executes the halt interface, the vcpu
69 * enters the 'halted' state and remains halted until there is some work
70 * pending for the vcpu (e.g. an alarm expires, host I/O completes on
71 * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
72 * state (waiting for the hypervisor to reschedule it). Finally, at any
73 * time when the vcpu is not in the 'running' state nor the 'halted'
74 * state, it is in the 'ready' state.
75 *
76 * Real time advances while the vcpu is 'running', 'ready', or
77 * 'halted'. Stolen time is the time in which the vcpu is in the
78 * 'ready' state. Available time is the remaining time -- the vcpu is
79 * either 'running' or 'halted'.
80 *
81 * All three views of time are accessible through the VMI cycle
82 * counters.
83 */
84
85/* The cycle counters. */
86#define VMI_CYCLES_REAL 0
87#define VMI_CYCLES_AVAILABLE 1
88#define VMI_CYCLES_STOLEN 2
89
90/* The alarm interface 'flags' bits */
91#define VMI_ALARM_COUNTERS 2
92
93#define VMI_ALARM_COUNTER_MASK 0x000000ff
94
95#define VMI_ALARM_WIRED_IRQ0 0x00000000
96#define VMI_ALARM_WIRED_LVTT 0x00010000
97
98#define VMI_ALARM_IS_ONESHOT 0x00000000
99#define VMI_ALARM_IS_PERIODIC 0x00000100
100
101#define CONFIG_VMI_ALARM_HZ 100
102
103#endif
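Given the three counters, stolen time can be read directly or derived: real time splits into available time (running plus halted) and stolen time (ready). A sketch, assuming vmi_timer_ops has already been populated from the ROM:

static u64 vmi_stolen_cycles_sketch(void)
{
        u64 real  = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
        u64 avail = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);

        /* equivalently: get_cycle_counter(VMI_CYCLES_STOLEN) */
        return real - avail;
}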
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 8da9609070f4..d4dbbe5f7bd9 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -7,7 +7,7 @@
7 7
8#include <asm/alternative.h> 8#include <asm/alternative.h>
9 9
10#if __GNUC__ < 4 || __GNUC_MINOR__ < 1 10#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
11/* Technically wrong, but this avoids compilation errors on some gcc 11/* Technically wrong, but this avoids compilation errors on some gcc
12 versions. */ 12 versions. */
13#define ADDR "=m" (*(volatile long *) addr) 13#define ADDR "=m" (*(volatile long *) addr)
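The old test misfires for any future major release with a low minor: a hypothetical gcc 5.0 has __GNUC__ < 4 false but __GNUC_MINOR__ < 1 true, so it would wrongly take the pre-4.1 fallback. The corrected form compares (major, minor) as a pair:

/* take the imprecise "=m" fallback only for compilers older than 4.1 */
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
# define ADDR "=m" (*(volatile long *)addr)
#else
# define ADDR "+m" (*(volatile long *)addr)     /* exact 4.1+ constraint */
#endif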
diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index 49dbab09ef2b..d2af227f06d0 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -66,6 +66,9 @@ static inline int dma_mapping_error(dma_addr_t dma_addr)
66#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) 66#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
67#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) 67#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
68 68
71
69extern void *dma_alloc_coherent(struct device *dev, size_t size, 72extern void *dma_alloc_coherent(struct device *dev, size_t size,
70 dma_addr_t *dma_handle, gfp_t gfp); 73 dma_addr_t *dma_handle, gfp_t gfp);
71extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr, 74extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index fa2086774105..6216fa3f2802 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(void);
46extern void e820_print_map(char *who); 46extern void e820_print_map(char *who);
47extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type); 47extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
48extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); 48extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
49extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
49 50
50extern void e820_setup_gap(void); 51extern void e820_setup_gap(void);
51extern void e820_register_active_regions(int nid, 52extern void e820_register_active_regions(int nid,
@@ -56,6 +57,7 @@ extern void finish_e820_parsing(void);
56extern struct e820map e820; 57extern struct e820map e820;
57 58
58extern unsigned ebda_addr, ebda_size; 59extern unsigned ebda_addr, ebda_size;
60extern unsigned long nodemap_addr, nodemap_size;
59#endif/*!__ASSEMBLY__*/ 61#endif/*!__ASSEMBLY__*/
60 62
61#endif/*__E820_HEADER*/ 63#endif/*__E820_HEADER*/
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 179cce755aa7..552df5f10a6d 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -91,7 +91,7 @@ extern void enable_8259A_irq(unsigned int irq);
91extern int i8259A_irq_pending(unsigned int irq); 91extern int i8259A_irq_pending(unsigned int irq);
92extern void make_8259A_irq(unsigned int irq); 92extern void make_8259A_irq(unsigned int irq);
93extern void init_8259A(int aeoi); 93extern void init_8259A(int aeoi);
94extern void FASTCALL(send_IPI_self(int vector)); 94extern void send_IPI_self(int vector);
95extern void init_VISWS_APIC_irqs(void); 95extern void init_VISWS_APIC_irqs(void);
96extern void setup_IO_APIC(void); 96extern void setup_IO_APIC(void);
97extern void disable_IO_APIC(void); 97extern void disable_IO_APIC(void);
diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h
index f5d84bb7c948..de2cd9a2303a 100644
--- a/include/asm-x86_64/io.h
+++ b/include/asm-x86_64/io.h
@@ -100,7 +100,7 @@ __OUTS(l)
100 100
101#define IO_SPACE_LIMIT 0xffff 101#define IO_SPACE_LIMIT 0xffff
102 102
103#if defined(__KERNEL__) && __x86_64__ 103#if defined(__KERNEL__) && defined(__x86_64__)
104 104
105#include <linux/vmalloc.h> 105#include <linux/vmalloc.h>
106 106
diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h
index 561ecbfd4cb5..f4fb238c89f1 100644
--- a/include/asm-x86_64/io_apic.h
+++ b/include/asm-x86_64/io_apic.h
@@ -85,18 +85,8 @@ struct IO_APIC_route_entry {
85 mask : 1, /* 0: enabled, 1: disabled */ 85 mask : 1, /* 0: enabled, 1: disabled */
86 __reserved_2 : 15; 86 __reserved_2 : 15;
87 87
88 union { struct { __u32 88 __u32 __reserved_3 : 24,
89 __reserved_1 : 24, 89 dest : 8;
90 physical_dest : 4,
91 __reserved_2 : 4;
92 } physical;
93
94 struct { __u32
95 __reserved_1 : 24,
96 logical_dest : 8;
97 } logical;
98 } dest;
99
100} __attribute__ ((packed)); 90} __attribute__ ((packed));
101 91
102/* 92/*
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h
index 5a11146d6d9c..177e92b4019b 100644
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -103,6 +103,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
103 103
104extern atomic_t mce_entry; 104extern atomic_t mce_entry;
105 105
106extern void do_machine_check(struct pt_regs *, long);
107
106#endif 108#endif
107 109
108#endif 110#endif
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index c38ebdf6f426..fb558fb1d211 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -11,24 +11,25 @@
11 11
12#include <asm/smp.h> 12#include <asm/smp.h>
13 13
14/* Should really switch to dynamic allocation at some point */
15#define NODEMAPSIZE 0x4fff
16
17/* Simple perfect hash to map physical addresses to node numbers */ 14/* Simple perfect hash to map physical addresses to node numbers */
18struct memnode { 15struct memnode {
19 int shift; 16 int shift;
20 u8 map[NODEMAPSIZE]; 17 unsigned int mapsize;
21} ____cacheline_aligned; 18 u8 *map;
19 u8 embedded_map[64-16];
20} ____cacheline_aligned; /* total size = 64 bytes */
22extern struct memnode memnode; 21extern struct memnode memnode;
23#define memnode_shift memnode.shift 22#define memnode_shift memnode.shift
24#define memnodemap memnode.map 23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25 25
26extern struct pglist_data *node_data[]; 26extern struct pglist_data *node_data[];
27 27
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{ 29{
30 unsigned nid; 30 unsigned nid;
31 VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); 31 VIRTUAL_BUG_ON(!memnodemap);
32 VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
32 nid = memnodemap[addr >> memnode_shift]; 33 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 34 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid; 35 return nid;
@@ -46,5 +47,10 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
46extern int pfn_valid(unsigned long pfn); 47extern int pfn_valid(unsigned long pfn);
47#endif 48#endif
48 49
50#ifdef CONFIG_NUMA_EMU
51#define FAKE_NODE_MIN_SIZE (64*1024*1024)
52#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul))
53#endif
54
49#endif 55#endif
50#endif 56#endif
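The nodemap is a perfect hash: memnode_shift is chosen at boot so every 1<<shift-byte chunk of physical memory lies entirely within one node, making the lookup a shift plus one byte load. After this change, maps up to 48 bytes live in the cacheline-sized struct (embedded_map), larger ones are allocated at boot, and memnodemapsize replaces the fixed NODEMAPSIZE bound. The lookup, minus the VIRTUAL_BUG_ON debug checks:

static inline int phys_to_nid_sketch(unsigned long addr)
{
        return memnodemap[addr >> memnode_shift];       /* one byte load */
}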
diff --git a/include/asm-x86_64/mutex.h b/include/asm-x86_64/mutex.h
index 16396b1de3e4..6c2949a3c677 100644
--- a/include/asm-x86_64/mutex.h
+++ b/include/asm-x86_64/mutex.h
@@ -21,7 +21,7 @@ do { \
21 unsigned long dummy; \ 21 unsigned long dummy; \
22 \ 22 \
23 typecheck(atomic_t *, v); \ 23 typecheck(atomic_t *, v); \
24 typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ 24 typecheck_fn(void (*)(atomic_t *), fail_fn); \
25 \ 25 \
26 __asm__ __volatile__( \ 26 __asm__ __volatile__( \
27 LOCK_PREFIX " decl (%%rdi) \n" \ 27 LOCK_PREFIX " decl (%%rdi) \n" \
@@ -47,7 +47,7 @@ do { \
47 */ 47 */
48static inline int 48static inline int
49__mutex_fastpath_lock_retval(atomic_t *count, 49__mutex_fastpath_lock_retval(atomic_t *count,
50 int fastcall (*fail_fn)(atomic_t *)) 50 int (*fail_fn)(atomic_t *))
51{ 51{
52 if (unlikely(atomic_dec_return(count) < 0)) 52 if (unlikely(atomic_dec_return(count) < 0))
53 return fail_fn(count); 53 return fail_fn(count);
@@ -67,7 +67,7 @@ do { \
67 unsigned long dummy; \ 67 unsigned long dummy; \
68 \ 68 \
69 typecheck(atomic_t *, v); \ 69 typecheck(atomic_t *, v); \
70 typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ 70 typecheck_fn(void (*)(atomic_t *), fail_fn); \
71 \ 71 \
72 __asm__ __volatile__( \ 72 __asm__ __volatile__( \
73 LOCK_PREFIX " incl (%%rdi) \n" \ 73 LOCK_PREFIX " incl (%%rdi) \n" \
diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h
index 43d4c333a8b1..4e28b6060a5e 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -18,11 +18,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p
18 set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); 18 set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
19} 19}
20 20
21static inline pmd_t *get_pmd(void)
22{
23 return (pmd_t *)get_zeroed_page(GFP_KERNEL);
24}
25
26static inline void pmd_free(pmd_t *pmd) 21static inline void pmd_free(pmd_t *pmd)
27{ 22{
28 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); 23 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 59901c690a0d..730bd6028416 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -359,15 +359,6 @@ static inline int pmd_large(pmd_t pte) {
359#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) 359#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
360#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE) 360#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
361 361
362/* physical address -> PTE */
363static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
364{
365 pte_t pte;
366 pte_val(pte) = physpage | pgprot_val(pgprot);
367 pte_val(pte) &= __supported_pte_mask;
368 return pte;
369}
370
371/* Change flags of a PTE */ 362/* Change flags of a PTE */
372static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) 363static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
373{ 364{
diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h
index 8079e29c14fd..1981f70fcad1 100644
--- a/include/asm-x86_64/uaccess.h
+++ b/include/asm-x86_64/uaccess.h
@@ -367,4 +367,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
367 return copy_user_generic((__force void *)dst, src, size); 367 return copy_user_generic((__force void *)dst, src, size);
368} 368}
369 369
370#define ARCH_HAS_NOCACHE_UACCESS 1
371extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest);
372
373static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
374{
375 might_sleep();
376 return __copy_user_nocache(dst, (__force void *)src, size, 1);
377}
378
379static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size)
380{
381 return __copy_user_nocache(dst, (__force void *)src, size, 0);
382}
383
370#endif /* __X86_64_UACCESS_H */ 384#endif /* __X86_64_UACCESS_H */
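The nocache variants use non-temporal stores so a large, write-once copy does not evict the CPU caches; the zerorest argument controls whether the destination tail is cleared after a faulting copy (the might_sleep wrapper clears it, the inatomic one does not). A usage sketch for a streaming write path:

static int stream_in_sketch(void *dst, const void __user *src,
                            unsigned size)
{
        if (!access_ok(VERIFY_READ, src, size))
                return -EFAULT;
        /* like the cached copies, returns the number of bytes left */
        return __copy_from_user_nocache(dst, src, size);
}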
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index 05cb8dd200de..0c7847165eae 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -56,11 +56,6 @@ extern struct vxtime_data vxtime;
56extern int vgetcpu_mode; 56extern int vgetcpu_mode;
57extern struct timezone sys_tz; 57extern struct timezone sys_tz;
58extern int sysctl_vsyscall; 58extern int sysctl_vsyscall;
59extern seqlock_t xtime_lock;
60
61extern int sysctl_vsyscall;
62
63#define ARCH_HAVE_XTIME_LOCK 1
64 59
65#endif /* __KERNEL__ */ 60#endif /* __KERNEL__ */
66 61
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index c1e82c514443..2d956cd566ae 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -59,6 +59,7 @@ struct linux_binfmt {
59 int (*load_shlib)(struct file *); 59 int (*load_shlib)(struct file *);
60 int (*core_dump)(long signr, struct pt_regs * regs, struct file * file); 60 int (*core_dump)(long signr, struct pt_regs * regs, struct file * file);
61 unsigned long min_coredump; /* minimal dump size */ 61 unsigned long min_coredump; /* minimal dump size */
62 int hasvdso;
62}; 63};
63 64
64extern int register_binfmt(struct linux_binfmt *); 65extern int register_binfmt(struct linux_binfmt *);
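hasvdso lets each binary format declare whether the arch vDSO applies to it; formats that leave it zero get no vDSO mapped. A sketch of a registration, where everything but the hasvdso field is an illustrative, hypothetical format:

static struct linux_binfmt myfmt = {
        .module       = THIS_MODULE,
        .load_binary  = myfmt_load_binary,      /* hypothetical loader */
        .core_dump    = myfmt_core_dump,        /* hypothetical dumper */
        .min_coredump = PAGE_SIZE,
        .hasvdso      = 1,      /* map the arch vDSO for this format */
};
/* register_binfmt(&myfmt) at module init */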
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index acb4ed130247..29af2d5df097 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,8 +17,15 @@
17#ifdef ARCH_HAS_NMI_WATCHDOG 17#ifdef ARCH_HAS_NMI_WATCHDOG
18#include <asm/nmi.h> 18#include <asm/nmi.h>
19extern void touch_nmi_watchdog(void); 19extern void touch_nmi_watchdog(void);
20extern void acpi_nmi_disable(void);
21extern void acpi_nmi_enable(void);
20#else 22#else
21# define touch_nmi_watchdog() touch_softlockup_watchdog() 23static inline void touch_nmi_watchdog(void)
24{
25 touch_softlockup_watchdog();
26}
27static inline void acpi_nmi_disable(void) { }
28static inline void acpi_nmi_enable(void) { }
22#endif 29#endif
23 30
24#ifndef trigger_all_cpu_backtrace 31#ifndef trigger_all_cpu_backtrace
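With these stubs, callers can use one call unconditionally: on NMI-watchdog arches it pets the hard watchdog, elsewhere it falls through to the soft-lockup one, and the new acpi_nmi_{enable,disable} pair compiles away. Sketch of the canonical busy-wait use, with a hypothetical hw_done() poll:

static void wait_for_hw_sketch(void)
{
        while (!hw_done()) {            /* hypothetical hardware poll */
                touch_nmi_watchdog();   /* don't trip either watchdog */
                cpu_relax();
        }
}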
diff --git a/include/linux/time.h b/include/linux/time.h
index 55cee172d723..eceb1a59b078 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -90,7 +90,7 @@ static inline struct timespec timespec_sub(struct timespec lhs,
90 90
91extern struct timespec xtime; 91extern struct timespec xtime;
92extern struct timespec wall_to_monotonic; 92extern struct timespec wall_to_monotonic;
93extern seqlock_t xtime_lock; 93extern seqlock_t xtime_lock __attribute__((weak));
94 94
95void timekeeping_init(void); 95void timekeeping_init(void);
96 96
diff --git a/init/main.c b/init/main.c
index 649ab5443d43..2421e1544127 100644
--- a/init/main.c
+++ b/init/main.c
@@ -726,7 +726,49 @@ static void run_init_process(char *init_filename)
726 kernel_execve(init_filename, argv_init, envp_init); 726 kernel_execve(init_filename, argv_init, envp_init);
727} 727}
728 728
729static int init(void * unused) 729/* This is a non-__init function. Force it to be noinline, otherwise gcc
 730 * inlines it into init() and it becomes part of the .init.text section
731 */
732static int noinline init_post(void)
733{
734 free_initmem();
735 unlock_kernel();
736 mark_rodata_ro();
737 system_state = SYSTEM_RUNNING;
738 numa_default_policy();
739
740 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
741 printk(KERN_WARNING "Warning: unable to open an initial console.\n");
742
743 (void) sys_dup(0);
744 (void) sys_dup(0);
745
746 if (ramdisk_execute_command) {
747 run_init_process(ramdisk_execute_command);
748 printk(KERN_WARNING "Failed to execute %s\n",
749 ramdisk_execute_command);
750 }
751
752 /*
753 * We try each of these until one succeeds.
754 *
755 * The Bourne shell can be used instead of init if we are
756 * trying to recover a really broken machine.
757 */
758 if (execute_command) {
759 run_init_process(execute_command);
760 printk(KERN_WARNING "Failed to execute %s. Attempting "
761 "defaults...\n", execute_command);
762 }
763 run_init_process("/sbin/init");
764 run_init_process("/etc/init");
765 run_init_process("/bin/init");
766 run_init_process("/bin/sh");
767
768 panic("No init found. Try passing init= option to kernel.");
769}
770
771static int __init init(void * unused)
730{ 772{
731 lock_kernel(); 773 lock_kernel();
732 /* 774 /*
@@ -774,39 +816,6 @@ static int init(void * unused)
774 * we're essentially up and running. Get rid of the 816 * we're essentially up and running. Get rid of the
775 * initmem segments and start the user-mode stuff.. 817 * initmem segments and start the user-mode stuff..
776 */ 818 */
777 free_initmem(); 819 init_post();
778 unlock_kernel(); 820 return 0;
779 mark_rodata_ro();
780 system_state = SYSTEM_RUNNING;
781 numa_default_policy();
782
783 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
784 printk(KERN_WARNING "Warning: unable to open an initial console.\n");
785
786 (void) sys_dup(0);
787 (void) sys_dup(0);
788
789 if (ramdisk_execute_command) {
790 run_init_process(ramdisk_execute_command);
791 printk(KERN_WARNING "Failed to execute %s\n",
792 ramdisk_execute_command);
793 }
794
795 /*
796 * We try each of these until one succeeds.
797 *
798 * The Bourne shell can be used instead of init if we are
799 * trying to recover a really broken machine.
800 */
801 if (execute_command) {
802 run_init_process(execute_command);
803 printk(KERN_WARNING "Failed to execute %s. Attempting "
804 "defaults...\n", execute_command);
805 }
806 run_init_process("/sbin/init");
807 run_init_process("/etc/init");
808 run_init_process("/bin/init");
809 run_init_process("/bin/sh");
810
811 panic("No init found. Try passing init= option to kernel.");
812} 821}
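The split matters because init() is now __init: anything gcc inlined into it would land in .init.text, which free_initmem() discards while init_post() is still running. The hazard in miniature, with hypothetical names:

static noinline int post_sketch(void)   /* pinned in .text by noinline */
{
        free_initmem();         /* discards .init.text */
        for (;;)
                run_stage2();   /* hypothetical; never returns */
}

static int __init boot_sketch(void *unused)
{
        /* ...__init-time setup... */
        return post_sketch();   /* if inlined, the loop above would
                                 * execute from the freed section */
}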
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31ca..796276141e51 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data)
217 sub_info->retval = ret; 217 sub_info->retval = ret;
218 } 218 }
219 219
220 complete(sub_info->complete); 220 if (sub_info->wait < 0)
221 kfree(sub_info);
222 else
223 complete(sub_info->complete);
221 return 0; 224 return 0;
222} 225}
223 226
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work)
239 pid = kernel_thread(____call_usermodehelper, sub_info, 242 pid = kernel_thread(____call_usermodehelper, sub_info,
240 CLONE_VFORK | SIGCHLD); 243 CLONE_VFORK | SIGCHLD);
241 244
245 if (wait < 0)
246 return;
247
242 if (pid < 0) { 248 if (pid < 0) {
243 sub_info->retval = pid; 249 sub_info->retval = pid;
244 complete(sub_info->complete); 250 complete(sub_info->complete);
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work)
253 * @envp: null-terminated environment list 259 * @envp: null-terminated environment list
254 * @session_keyring: session keyring for process (NULL for an empty keyring) 260 * @session_keyring: session keyring for process (NULL for an empty keyring)
255 * @wait: wait for the application to finish and return status. 261 * @wait: wait for the application to finish and return status.
 262 * when -1, don't wait at all, but you get no useful error back when
263 * the program couldn't be exec'ed. This makes it safe to call
264 * from interrupt context.
256 * 265 *
257 * Runs a user-space application. The application is started 266 * Runs a user-space application. The application is started
258 * asynchronously if wait is not set, and runs as a child of keventd. 267 * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
265 struct key *session_keyring, int wait) 274 struct key *session_keyring, int wait)
266{ 275{
267 DECLARE_COMPLETION_ONSTACK(done); 276 DECLARE_COMPLETION_ONSTACK(done);
268 struct subprocess_info sub_info = { 277 struct subprocess_info *sub_info;
269 .work = __WORK_INITIALIZER(sub_info.work, 278 int retval;
270 __call_usermodehelper),
271 .complete = &done,
272 .path = path,
273 .argv = argv,
274 .envp = envp,
275 .ring = session_keyring,
276 .wait = wait,
277 .retval = 0,
278 };
279 279
280 if (!khelper_wq) 280 if (!khelper_wq)
281 return -EBUSY; 281 return -EBUSY;
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
283 if (path[0] == '\0') 283 if (path[0] == '\0')
284 return 0; 284 return 0;
285 285
286 queue_work(khelper_wq, &sub_info.work); 286 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
287 if (!sub_info)
288 return -ENOMEM;
289
290 INIT_WORK(&sub_info->work, __call_usermodehelper);
291 sub_info->complete = &done;
292 sub_info->path = path;
293 sub_info->argv = argv;
294 sub_info->envp = envp;
295 sub_info->ring = session_keyring;
296 sub_info->wait = wait;
297
298 queue_work(khelper_wq, &sub_info->work);
299 if (wait < 0) /* task has freed sub_info */
300 return 0;
287 wait_for_completion(&done); 301 wait_for_completion(&done);
288 return sub_info.retval; 302 retval = sub_info->retval;
303 kfree(sub_info);
304 return retval;
289} 305}
290EXPORT_SYMBOL(call_usermodehelper_keys); 306EXPORT_SYMBOL(call_usermodehelper_keys);
291 307
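With wait == -1 the call returns as soon as the work is queued and the helper thread frees sub_info itself, so nothing is left to sleep on. A usage sketch with a hypothetical helper path, fired from a context that cannot block:

static void fire_and_forget_sketch(void)
{
        char *argv[] = { "/sbin/myhelper", "event", NULL };     /* hypothetical */
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
                         NULL };

        /* wait == -1: returns immediately, no exit status reported */
        call_usermodehelper_keys("/sbin/myhelper", argv, envp, NULL, -1);
}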
diff --git a/kernel/sched.c b/kernel/sched.c
index 08f86178aa34..0dc757246d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1853 struct mm_struct *mm = next->mm; 1853 struct mm_struct *mm = next->mm;
1854 struct mm_struct *oldmm = prev->active_mm; 1854 struct mm_struct *oldmm = prev->active_mm;
1855 1855
1856 /*
1857 * For paravirt, this is coupled with an exit in switch_to to
1858 * combine the page table reload and the switch backend into
1859 * one hypercall.
1860 */
1861 arch_enter_lazy_cpu_mode();
1862
1856 if (!mm) { 1863 if (!mm) {
1857 next->active_mm = oldmm; 1864 next->active_mm = oldmm;
1858 atomic_inc(&oldmm->mm_count); 1865 atomic_inc(&oldmm->mm_count);
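arch_enter_lazy_cpu_mode() is a paravirt hook (a no-op natively): it opens a window in which a backend may queue expensive state updates and combine them with the page-table reload in switch_to. A sketch of the backend side, with hypothetical myhv_* names:

static int myhv_batching;       /* hypothetical per-cpu flag */

static void myhv_enter_lazy_cpu(void)
{
        myhv_batching = 1;      /* start queuing state updates */
}

static void myhv_exit_lazy_cpu(void)
{
        myhv_batching = 0;
        myhv_flush_batch();     /* hypothetical: one combined hypercall */
}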
diff --git a/kernel/timer.c b/kernel/timer.c
index 8533c3796082..4902181e10e6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1162,11 +1162,9 @@ static inline void calc_load(unsigned long ticks)
1162 * This read-write spinlock protects us from races in SMP while 1162 * This read-write spinlock protects us from races in SMP while
1163 * playing with xtime and avenrun. 1163 * playing with xtime and avenrun.
1164 */ 1164 */
1165#ifndef ARCH_HAVE_XTIME_LOCK 1165__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1166__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1167 1166
1168EXPORT_SYMBOL(xtime_lock); 1167EXPORT_SYMBOL(xtime_lock);
1169#endif
1170 1168
1171/* 1169/*
1172 * This function runs timers and the timer-tq in bottom half context. 1170 * This function runs timers and the timer-tq in bottom half context.
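Replacing the ARCH_HAVE_XTIME_LOCK #ifdef with a weak definition moves the decision from the preprocessor to the linker: the generic xtime_lock is used unless some arch object provides a strong definition, which then wins. The mechanism in miniature:

/* generic.c -- default definition, overridable */
__attribute__((weak)) int tick_mode = 0;

/* arch.c -- strong definition; the linker picks this one */
int tick_mode = 1;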
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 2aa47623f5f8..569e68410d7a 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -641,12 +641,20 @@ static int secref_whitelist(const char *modname, const char *tosec,
641 if (f1 && f2) 641 if (f1 && f2)
642 return 1; 642 return 1;
643 643
644 /* Whitelist all references from .pci_fixup section if vmlinux */ 644 /* Whitelist all references from .pci_fixup section if vmlinux
 645 * Whitelist all references from .text.head to .init.data if vmlinux
 646 * Whitelist all references from .text.head to .init.text if vmlinux
647 */
645 if (is_vmlinux(modname)) { 648 if (is_vmlinux(modname)) {
646 if ((strcmp(fromsec, ".pci_fixup") == 0) && 649 if ((strcmp(fromsec, ".pci_fixup") == 0) &&
647 (strcmp(tosec, ".init.text") == 0)) 650 (strcmp(tosec, ".init.text") == 0))
648 return 1; 651 return 1;
649 652
653 if ((strcmp(fromsec, ".text.head") == 0) &&
654 ((strcmp(tosec, ".init.data") == 0) ||
655 (strcmp(tosec, ".init.text") == 0)))
656 return 1;
657
650 /* Check for pattern 3 */ 658 /* Check for pattern 3 */
651 for (s = pat3refsym; *s; s++) 659 for (s = pat3refsym; *s; s++)
652 if (strcmp(refsymname, *s) == 0) 660 if (strcmp(refsymname, *s) == 0)