aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-08-14 06:19:59 -0400
committerIngo Molnar <mingo@elte.hu>2008-08-14 06:19:59 -0400
commit8d7ccaa545490cdffdfaff0842436a8dd85cf47b (patch)
tree8129b5907161bc6ae26deb3645ce1e280c5e1f51 /arch/x86
parentb2139aa0eec330c711c5a279db361e5ef1178e78 (diff)
parent30a2f3c60a84092c8084dfe788b710f8d0768cd4 (diff)
Merge commit 'v2.6.27-rc3' into x86/prototypes
Conflicts: include/asm-x86/dma-mapping.h Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig36
-rw-r--r--arch/x86/Kconfig.cpu2
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/compressed/misc.c39
-rw-r--r--arch/x86/ia32/ia32_aout.c6
-rw-r--r--arch/x86/ia32/ia32entry.S97
-rw-r--r--arch/x86/ia32/sys_ia32.c2
-rw-r--r--arch/x86/kernel/acpi/sleep.c4
-rw-r--r--arch/x86/kernel/amd_iommu.c17
-rw-r--r--arch/x86/kernel/amd_iommu_init.c4
-rw-r--r--arch/x86/kernel/apic_32.c14
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/cpu/bugs.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c120
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c149
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c4
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/entry_32.S55
-rw-r--r--arch/x86/kernel/entry_64.S55
-rw-r--r--arch/x86/kernel/genapic_64.c1
-rw-r--r--arch/x86/kernel/genapic_flat_64.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c4
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/hpet.c10
-rw-r--r--arch/x86/kernel/io_apic_32.c6
-rw-r--r--arch/x86/kernel/io_apic_64.c37
-rw-r--r--arch/x86/kernel/irqinit_64.c5
-rw-r--r--arch/x86/kernel/kprobes.c6
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c39
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/microcode.c14
-rw-r--r--arch/x86/kernel/module_64.c1
-rw-r--r--arch/x86/kernel/mpparse.c11
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/paravirt.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c158
-rw-r--r--arch/x86/kernel/pci-dma.c163
-rw-r--r--arch/x86/kernel/pci-gart_64.c18
-rw-r--r--arch/x86/kernel/pci-nommu.c14
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c2
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/reboot.c3
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S174
-rw-r--r--arch/x86/kernel/setup.c42
-rw-r--r--arch/x86/kernel/setup_percpu.c21
-rw-r--r--arch/x86/kernel/signal_32.c3
-rw-r--r--arch/x86/kernel/signal_64.c56
-rw-r--r--arch/x86/kernel/smpboot.c30
-rw-r--r--arch/x86/kernel/syscall_table_32.S6
-rw-r--r--arch/x86/kernel/vmi_32.c3
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/mmu.c107
-rw-r--r--arch/x86/kvm/paging_tmpl.h12
-rw-r--r--arch/x86/kvm/svm.c10
-rw-r--r--arch/x86/kvm/vmx.c22
-rw-r--r--arch/x86/kvm/x86.c133
-rw-r--r--arch/x86/lguest/boot.c3
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S3
-rw-r--r--arch/x86/mach-es7000/es7000plat.c8
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/discontig_32.c3
-rw-r--r--arch/x86/mm/dump_pagetables.c10
-rw-r--r--arch/x86/mm/gup.c298
-rw-r--r--arch/x86/mm/hugetlbpage.c78
-rw-r--r--arch/x86/mm/init_64.c37
-rw-r--r--arch/x86/mm/ioremap.c8
-rw-r--r--arch/x86/mm/numa_64.c4
-rw-r--r--arch/x86/mm/pgtable.c3
-rw-r--r--arch/x86/mm/pgtable_32.c47
-rw-r--r--arch/x86/oprofile/nmi_int.c36
-rw-r--r--arch/x86/pci/fixup.c3
-rw-r--r--arch/x86/pci/i386.c27
-rw-r--r--arch/x86/pci/irq.c106
-rw-r--r--arch/x86/pci/numaq_32.c5
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/mmu.c8
-rw-r--r--arch/x86/xen/smp.c4
-rw-r--r--arch/x86/xen/xen-asm_64.S2
91 files changed, 1631 insertions, 878 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 03980cb04291..ac2fb0641a04 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,12 +21,16 @@ config X86
21 select HAVE_UNSTABLE_SCHED_CLOCK 21 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 22 select HAVE_IDE
23 select HAVE_OPROFILE 23 select HAVE_OPROFILE
24 select HAVE_IOREMAP_PROT
24 select HAVE_KPROBES 25 select HAVE_KPROBES
26 select ARCH_WANT_OPTIONAL_GPIOLIB
25 select HAVE_KRETPROBES 27 select HAVE_KRETPROBES
26 select HAVE_DYNAMIC_FTRACE 28 select HAVE_DYNAMIC_FTRACE
27 select HAVE_FTRACE 29 select HAVE_FTRACE
28 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 30 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
29 select HAVE_ARCH_KGDB if !X86_VOYAGER 31 select HAVE_ARCH_KGDB if !X86_VOYAGER
32 select HAVE_GENERIC_DMA_COHERENT if X86_32
33 select HAVE_EFFICIENT_UNALIGNED_ACCESS
30 34
31config ARCH_DEFCONFIG 35config ARCH_DEFCONFIG
32 string 36 string
@@ -329,20 +333,6 @@ config X86_BIGSMP
329 333
330endif 334endif
331 335
332config X86_RDC321X
333 bool "RDC R-321x SoC"
334 depends on X86_32
335 select M486
336 select X86_REBOOTFIXUPS
337 select GENERIC_GPIO
338 select LEDS_CLASS
339 select LEDS_GPIO
340 select NEW_LEDS
341 help
342 This option is needed for RDC R-321x system-on-chip, also known
343 as R-8610-(G).
344 If you don't have one of these chips, you should say N here.
345
346config X86_VSMP 336config X86_VSMP
347 bool "Support for ScaleMP vSMP" 337 bool "Support for ScaleMP vSMP"
348 select PARAVIRT 338 select PARAVIRT
@@ -366,6 +356,16 @@ config X86_VISWS
366 A kernel compiled for the Visual Workstation will run on general 356 A kernel compiled for the Visual Workstation will run on general
367 PCs as well. See <file:Documentation/sgi-visws.txt> for details. 357 PCs as well. See <file:Documentation/sgi-visws.txt> for details.
368 358
359config X86_RDC321X
360 bool "RDC R-321x SoC"
361 depends on X86_32
362 select M486
363 select X86_REBOOTFIXUPS
364 help
365 This option is needed for RDC R-321x system-on-chip, also known
366 as R-8610-(G).
367 If you don't have one of these chips, you should say N here.
368
369config SCHED_NO_NO_OMIT_FRAME_POINTER 369config SCHED_NO_NO_OMIT_FRAME_POINTER
370 def_bool y 370 def_bool y
371 prompt "Single-depth WCHAN output" 371 prompt "Single-depth WCHAN output"
@@ -1276,6 +1276,14 @@ config CRASH_DUMP
1276 (CONFIG_RELOCATABLE=y). 1276 (CONFIG_RELOCATABLE=y).
1277 For more details see Documentation/kdump/kdump.txt 1277 For more details see Documentation/kdump/kdump.txt
1278 1278
1279config KEXEC_JUMP
1280 bool "kexec jump (EXPERIMENTAL)"
1281 depends on EXPERIMENTAL
1282 depends on KEXEC && HIBERNATION && X86_32
1283 help
1284 Jump between original kernel and kexeced kernel and invoke
1285 code in physical address mode via KEXEC
1286
1279config PHYSICAL_START 1287config PHYSICAL_START
1280 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1288 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1281 default "0x1000000" if X86_NUMAQ 1289 default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 54b8c02c71e6..2c518fbc52ec 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -414,4 +414,4 @@ config X86_MINIMUM_CPU_FAMILY
414 414
415config X86_DEBUGCTLMSR 415config X86_DEBUGCTLMSR
416 def_bool y 416 def_bool y
417 depends on !(M586MMX || M586TSC || M586 || M486 || M386) 417 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 919ce21ea654..f5631da585b6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,11 +118,6 @@ mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ 118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ 119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
120 120
121# RDC R-321x subarch support
122mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
123mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
124core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
125
126# default subarch .h files 121# default subarch .h files
127mflags-y += -Iinclude/asm-x86/mach-default 122mflags-y += -Iinclude/asm-x86/mach-default
128 123
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index bc5553b496f7..9fea73706479 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -182,8 +182,6 @@ static unsigned outcnt;
182static int fill_inbuf(void); 182static int fill_inbuf(void);
183static void flush_window(void); 183static void flush_window(void);
184static void error(char *m); 184static void error(char *m);
185static void gzip_mark(void **);
186static void gzip_release(void **);
187 185
188/* 186/*
189 * This is set up by the setup-routine at boot-time 187 * This is set up by the setup-routine at boot-time
@@ -196,9 +194,6 @@ extern int input_len;
196 194
197static long bytes_out; 195static long bytes_out;
198 196
199static void *malloc(int size);
200static void free(void *where);
201
202static void *memset(void *s, int c, unsigned n); 197static void *memset(void *s, int c, unsigned n);
203static void *memcpy(void *dest, const void *src, unsigned n); 198static void *memcpy(void *dest, const void *src, unsigned n);
204 199
@@ -220,40 +215,6 @@ static int lines, cols;
220 215
221#include "../../../../lib/inflate.c" 216#include "../../../../lib/inflate.c"
222 217
223static void *malloc(int size)
224{
225 void *p;
226
227 if (size < 0)
228 error("Malloc error");
229 if (free_mem_ptr <= 0)
230 error("Memory error");
231
232 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
233
234 p = (void *)free_mem_ptr;
235 free_mem_ptr += size;
236
237 if (free_mem_ptr >= free_mem_end_ptr)
238 error("Out of memory");
239
240 return p;
241}
242
243static void free(void *where)
244{ /* Don't care */
245}
246
247static void gzip_mark(void **ptr)
248{
249 *ptr = (void *) free_mem_ptr;
250}
251
252static void gzip_release(void **ptr)
253{
254 free_mem_ptr = (memptr) *ptr;
255}
256
257static void scroll(void) 218static void scroll(void)
258{ 219{
259 int i; 220 int i;
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..a0e1dbe67dc1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
441 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 441 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
443 set_fs(USER_DS); 443 set_fs(USER_DS);
444 if (unlikely(current->ptrace & PT_PTRACED)) {
445 if (current->ptrace & PT_TRACE_EXEC)
446 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
447 else
448 send_sig(SIGTRAP, current, 0);
449 }
450 return 0; 444 return 0;
451} 445}
452 446
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 23d146ce676b..ffc1bb4fed7d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -15,6 +15,16 @@
15#include <asm/irqflags.h> 15#include <asm/irqflags.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17 17
18/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
19#include <linux/elf-em.h>
20#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
21#define __AUDIT_ARCH_LE 0x40000000
22
23#ifndef CONFIG_AUDITSYSCALL
24#define sysexit_audit int_ret_from_sys_call
25#define sysretl_audit int_ret_from_sys_call
26#endif
27
18#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19 29
20 .macro IA32_ARG_FIXUP noebp=0 30 .macro IA32_ARG_FIXUP noebp=0
@@ -148,13 +158,15 @@ ENTRY(ia32_sysenter_target)
148 ja ia32_badsys 158 ja ia32_badsys
149sysenter_do_call: 159sysenter_do_call:
150 IA32_ARG_FIXUP 1 160 IA32_ARG_FIXUP 1
161sysenter_dispatch:
151 call *ia32_sys_call_table(,%rax,8) 162 call *ia32_sys_call_table(,%rax,8)
152 movq %rax,RAX-ARGOFFSET(%rsp) 163 movq %rax,RAX-ARGOFFSET(%rsp)
153 GET_THREAD_INFO(%r10) 164 GET_THREAD_INFO(%r10)
154 DISABLE_INTERRUPTS(CLBR_NONE) 165 DISABLE_INTERRUPTS(CLBR_NONE)
155 TRACE_IRQS_OFF 166 TRACE_IRQS_OFF
156 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 167 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
157 jnz int_ret_from_sys_call 168 jnz sysexit_audit
169sysexit_from_sys_call:
158 andl $~TS_COMPAT,TI_status(%r10) 170 andl $~TS_COMPAT,TI_status(%r10)
159 /* clear IF, that popfq doesn't enable interrupts early */ 171 /* clear IF, that popfq doesn't enable interrupts early */
160 andl $~0x200,EFLAGS-R11(%rsp) 172 andl $~0x200,EFLAGS-R11(%rsp)
@@ -170,9 +182,63 @@ sysenter_do_call:
170 TRACE_IRQS_ON 182 TRACE_IRQS_ON
171 ENABLE_INTERRUPTS_SYSEXIT32 183 ENABLE_INTERRUPTS_SYSEXIT32
172 184
173sysenter_tracesys: 185#ifdef CONFIG_AUDITSYSCALL
186 .macro auditsys_entry_common
187 movl %esi,%r9d /* 6th arg: 4th syscall arg */
188 movl %edx,%r8d /* 5th arg: 3rd syscall arg */
189 /* (already in %ecx) 4th arg: 2nd syscall arg */
190 movl %ebx,%edx /* 3rd arg: 1st syscall arg */
191 movl %eax,%esi /* 2nd arg: syscall number */
192 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
193 call audit_syscall_entry
194 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
195 cmpl $(IA32_NR_syscalls-1),%eax
196 ja ia32_badsys
197 movl %ebx,%edi /* reload 1st syscall arg */
198 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
199 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
200 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
201 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
202 .endm
203
204 .macro auditsys_exit exit,ebpsave=RBP
205 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
206 jnz int_ret_from_sys_call
207 TRACE_IRQS_ON
208 sti
209 movl %eax,%esi /* second arg, syscall return value */
210 cmpl $0,%eax /* is it < 0? */
211 setl %al /* 1 if so, 0 if not */
212 movzbl %al,%edi /* zero-extend that into %edi */
213 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
214 call audit_syscall_exit
215 GET_THREAD_INFO(%r10)
216 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
217 movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
218 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
219 cli
220 TRACE_IRQS_OFF
221 testl %edi,TI_flags(%r10)
222 jnz int_with_check
223 jmp \exit
224 .endm
225
226sysenter_auditsys:
174 CFI_RESTORE_STATE 227 CFI_RESTORE_STATE
228 auditsys_entry_common
229 movl %ebp,%r9d /* reload 6th syscall arg */
230 jmp sysenter_dispatch
231
232sysexit_audit:
233 auditsys_exit sysexit_from_sys_call
234#endif
235
236sysenter_tracesys:
175 xchgl %r9d,%ebp 237 xchgl %r9d,%ebp
238#ifdef CONFIG_AUDITSYSCALL
239 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
240 jz sysenter_auditsys
241#endif
176 SAVE_REST 242 SAVE_REST
177 CLEAR_RREGS 243 CLEAR_RREGS
178 movq %r9,R9(%rsp) 244 movq %r9,R9(%rsp)
@@ -252,13 +318,15 @@ cstar_do_call:
252 cmpl $IA32_NR_syscalls-1,%eax 318 cmpl $IA32_NR_syscalls-1,%eax
253 ja ia32_badsys 319 ja ia32_badsys
254 IA32_ARG_FIXUP 1 320 IA32_ARG_FIXUP 1
321cstar_dispatch:
255 call *ia32_sys_call_table(,%rax,8) 322 call *ia32_sys_call_table(,%rax,8)
256 movq %rax,RAX-ARGOFFSET(%rsp) 323 movq %rax,RAX-ARGOFFSET(%rsp)
257 GET_THREAD_INFO(%r10) 324 GET_THREAD_INFO(%r10)
258 DISABLE_INTERRUPTS(CLBR_NONE) 325 DISABLE_INTERRUPTS(CLBR_NONE)
259 TRACE_IRQS_OFF 326 TRACE_IRQS_OFF
260 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 327 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
261 jnz int_ret_from_sys_call 328 jnz sysretl_audit
329sysretl_from_sys_call:
262 andl $~TS_COMPAT,TI_status(%r10) 330 andl $~TS_COMPAT,TI_status(%r10)
263 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 331 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
264 movl RIP-ARGOFFSET(%rsp),%ecx 332 movl RIP-ARGOFFSET(%rsp),%ecx
@@ -270,8 +338,23 @@ cstar_do_call:
270 CFI_RESTORE rsp 338 CFI_RESTORE rsp
271 USERGS_SYSRET32 339 USERGS_SYSRET32
272 340
273cstar_tracesys: 341#ifdef CONFIG_AUDITSYSCALL
342cstar_auditsys:
274 CFI_RESTORE_STATE 343 CFI_RESTORE_STATE
344 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
345 auditsys_entry_common
346 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
347 jmp cstar_dispatch
348
349sysretl_audit:
350 auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
351#endif
352
353cstar_tracesys:
354#ifdef CONFIG_AUDITSYSCALL
355 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
356 jz cstar_auditsys
357#endif
275 xchgl %r9d,%ebp 358 xchgl %r9d,%ebp
276 SAVE_REST 359 SAVE_REST
277 CLEAR_RREGS 360 CLEAR_RREGS
@@ -743,4 +826,10 @@ ia32_sys_call_table:
743 .quad sys32_fallocate 826 .quad sys32_fallocate
744 .quad compat_sys_timerfd_settime /* 325 */ 827 .quad compat_sys_timerfd_settime /* 325 */
745 .quad compat_sys_timerfd_gettime 828 .quad compat_sys_timerfd_gettime
829 .quad compat_sys_signalfd4
830 .quad sys_eventfd2
831 .quad sys_epoll_create1
832 .quad sys_dup3 /* 330 */
833 .quad sys_pipe2
834 .quad sys_inotify_init1
746ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index f00afdf61e67..d3c64088b981 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -238,7 +238,7 @@ asmlinkage long sys32_pipe(int __user *fd)
238 int retval; 238 int retval;
239 int fds[2]; 239 int fds[2];
240 240
241 retval = do_pipe(fds); 241 retval = do_pipe_flags(fds, 0);
242 if (retval) 242 if (retval)
243 goto out; 243 goto out;
244 if (copy_to_user(fd, fds, sizeof(fds))) 244 if (copy_to_user(fd, fds, sizeof(fds)))
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a3ddad18aaa3..fa2161d5003b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -150,6 +150,10 @@ static int __init acpi_sleep_setup(char *str)
150 acpi_realmode_flags |= 2; 150 acpi_realmode_flags |= 2;
151 if (strncmp(str, "s3_beep", 7) == 0) 151 if (strncmp(str, "s3_beep", 7) == 0)
152 acpi_realmode_flags |= 4; 152 acpi_realmode_flags |= 4;
153#ifdef CONFIG_HIBERNATION
154 if (strncmp(str, "s4_nohwsig", 10) == 0)
155 acpi_no_s4_hw_signature();
156#endif
153 if (strncmp(str, "old_ordering", 12) == 0) 157 if (strncmp(str, "old_ordering", 12) == 0)
154 acpi_old_suspend_ordering(); 158 acpi_old_suspend_ordering();
155 str = strchr(str, ','); 159 str = strchr(str, ',');
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index c25210e6ac88..22d7d050905d 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -29,9 +29,6 @@
29 29
30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
31 31
32#define to_pages(addr, size) \
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34
35#define EXIT_LOOP_COUNT 10000000 32#define EXIT_LOOP_COUNT 10000000
36 33
37static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
@@ -185,7 +182,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
185 u64 address, size_t size) 182 u64 address, size_t size)
186{ 183{
187 int s = 0; 184 int s = 0;
188 unsigned pages = to_pages(address, size); 185 unsigned pages = iommu_num_pages(address, size);
189 186
190 address &= PAGE_MASK; 187 address &= PAGE_MASK;
191 188
@@ -557,8 +554,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
557 if (iommu->exclusion_start && 554 if (iommu->exclusion_start &&
558 iommu->exclusion_start < dma_dom->aperture_size) { 555 iommu->exclusion_start < dma_dom->aperture_size) {
559 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 556 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
560 int pages = to_pages(iommu->exclusion_start, 557 int pages = iommu_num_pages(iommu->exclusion_start,
561 iommu->exclusion_length); 558 iommu->exclusion_length);
562 dma_ops_reserve_addresses(dma_dom, startpage, pages); 559 dma_ops_reserve_addresses(dma_dom, startpage, pages);
563 } 560 }
564 561
@@ -667,7 +664,7 @@ static int get_device_resources(struct device *dev,
667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 664 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
668 665
669 /* device not translated by any IOMMU in the system? */ 666 /* device not translated by any IOMMU in the system? */
670 if (_bdf >= amd_iommu_last_bdf) { 667 if (_bdf > amd_iommu_last_bdf) {
671 *iommu = NULL; 668 *iommu = NULL;
672 *domain = NULL; 669 *domain = NULL;
673 *bdf = 0xffff; 670 *bdf = 0xffff;
@@ -767,7 +764,7 @@ static dma_addr_t __map_single(struct device *dev,
767 unsigned int pages; 764 unsigned int pages;
768 int i; 765 int i;
769 766
770 pages = to_pages(paddr, size); 767 pages = iommu_num_pages(paddr, size);
771 paddr &= PAGE_MASK; 768 paddr &= PAGE_MASK;
772 769
773 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 770 address = dma_ops_alloc_addresses(dev, dma_dom, pages);
@@ -802,7 +799,7 @@ static void __unmap_single(struct amd_iommu *iommu,
802 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) 799 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
803 return; 800 return;
804 801
805 pages = to_pages(dma_addr, size); 802 pages = iommu_num_pages(dma_addr, size);
806 dma_addr &= PAGE_MASK; 803 dma_addr &= PAGE_MASK;
807 start = dma_addr; 804 start = dma_addr;
808 805
@@ -1085,7 +1082,7 @@ void prealloc_protection_domains(void)
1085 1082
1086 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1083 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1087 devid = (dev->bus->number << 8) | dev->devfn; 1084 devid = (dev->bus->number << 8) | dev->devfn;
1088 if (devid >= amd_iommu_last_bdf) 1085 if (devid > amd_iommu_last_bdf)
1089 continue; 1086 continue;
1090 devid = amd_iommu_alias_table[devid]; 1087 devid = amd_iommu_alias_table[devid];
1091 if (domain_for_device(devid)) 1088 if (domain_for_device(devid))
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c9d8ff2eb130..d9a9da597e79 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -732,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
732 set_device_exclusion_range(m->devid, m); 732 set_device_exclusion_range(m->devid, m);
733 break; 733 break;
734 case ACPI_IVMD_TYPE_ALL: 734 case ACPI_IVMD_TYPE_ALL:
735 for (i = 0; i < amd_iommu_last_bdf; ++i) 735 for (i = 0; i <= amd_iommu_last_bdf; ++i)
736 set_device_exclusion_range(i, m); 736 set_device_exclusion_range(i, m);
737 break; 737 break;
738 case ACPI_IVMD_TYPE_RANGE: 738 case ACPI_IVMD_TYPE_RANGE:
@@ -934,7 +934,7 @@ int __init amd_iommu_init(void)
934 /* 934 /*
935 * let all alias entries point to itself 935 * let all alias entries point to itself
936 */ 936 */
937 for (i = 0; i < amd_iommu_last_bdf; ++i) 937 for (i = 0; i <= amd_iommu_last_bdf; ++i)
938 amd_iommu_alias_table[i] = i; 938 amd_iommu_alias_table[i] = i;
939 939
940 /* 940 /*
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index d6c898358371..039a8d4aaf62 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1720,15 +1720,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1720} 1720}
1721early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1721early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1722 1722
1723static int __init apic_set_verbosity(char *str) 1723static int __init apic_set_verbosity(char *arg)
1724{ 1724{
1725 if (strcmp("debug", str) == 0) 1725 if (!arg)
1726 return -EINVAL;
1727
1728 if (strcmp(arg, "debug") == 0)
1726 apic_verbosity = APIC_DEBUG; 1729 apic_verbosity = APIC_DEBUG;
1727 else if (strcmp("verbose", str) == 0) 1730 else if (strcmp(arg, "verbose") == 0)
1728 apic_verbosity = APIC_VERBOSE; 1731 apic_verbosity = APIC_VERBOSE;
1729 return 1; 1732
1733 return 0;
1730} 1734}
1731__setup("apic=", apic_set_verbosity); 1735early_param("apic", apic_set_verbosity);
1732 1736
1733static int __init lapic_insert_resource(void) 1737static int __init lapic_insert_resource(void)
1734{ 1738{
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9b441331e9..9ee24e6bc4b0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,7 +219,6 @@
219#include <linux/time.h> 219#include <linux/time.h>
220#include <linux/sched.h> 220#include <linux/sched.h>
221#include <linux/pm.h> 221#include <linux/pm.h>
222#include <linux/pm_legacy.h>
223#include <linux/capability.h> 222#include <linux/capability.h>
224#include <linux/device.h> 223#include <linux/device.h>
225#include <linux/kernel.h> 224#include <linux/kernel.h>
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c9b58a806e85..c8e315f1aa83 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
50 */ 50 */
51static void __init check_fpu(void) 51static void __init check_fpu(void)
52{ 52{
53 s32 fdiv_bug;
54
53 if (!boot_cpu_data.hard_math) { 55 if (!boot_cpu_data.hard_math) {
54#ifndef CONFIG_MATH_EMULATION 56#ifndef CONFIG_MATH_EMULATION
55 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 57 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -74,8 +76,10 @@ static void __init check_fpu(void)
74 "fistpl %0\n\t" 76 "fistpl %0\n\t"
75 "fwait\n\t" 77 "fwait\n\t"
76 "fninit" 78 "fninit"
77 : "=m" (*&boot_cpu_data.fdiv_bug) 79 : "=m" (*&fdiv_bug)
78 : "m" (*&x), "m" (*&y)); 80 : "m" (*&x), "m" (*&y));
81
82 boot_cpu_data.fdiv_bug = fdiv_bug;
79 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
80 printk("Hmm, FPU with FDIV bug.\n"); 84 printk("Hmm, FPU with FDIV bug.\n");
81} 85}
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596d..efae3b22a0ff 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
235 If in doubt, say N. 235 If in doubt, say N.
236 236
237config X86_E_POWERSAVER 237config X86_E_POWERSAVER
238 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" 238 tristate "VIA C7 Enhanced PowerSaver"
239 select CPU_FREQ_TABLE 239 select CPU_FREQ_TABLE
240 depends on X86_32 && EXPERIMENTAL 240 depends on X86_32
241 help 241 help
242 This adds the CPUFreq driver for VIA C7 processors. 242 This adds the CPUFreq driver for VIA C7 processors.
243 243
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..dd097b835839 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd)
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 unsigned int i; 203 unsigned int i;
204 204
205 for_each_cpu_mask(i, cmd->mask) { 205 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
207 do_drv_write(cmd); 207 do_drv_write(cmd);
208 } 208 }
@@ -451,7 +451,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 451
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 452 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 453 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 454 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 455 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 457 }
@@ -466,7 +466,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 466 }
467 } 467 }
468 468
469 for_each_cpu_mask(i, cmd.mask) { 469 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 470 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 472 }
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f563..e4a4bf870e94 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
44 * It is important that the frequencies 44 * It is important that the frequencies
45 * are listed in ascending order here! 45 * are listed in ascending order here!
46 */ 46 */
47struct s_elan_multiplier elan_multiplier[] = { 47static struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18}, 48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10}, 49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08}, 50 {4000, 0x02, 0x08},
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..f1685fb91fbd 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..4e7271999a74 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid)
66 return 800 + (fid * 100); 66 return 800 + (fid * 100);
67} 67}
68 68
69
70/* Return a frequency in KHz, given an input fid */ 69/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid) 70static u32 find_khz_freq_from_fid(u32 fid)
72{ 71{
@@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
78 return data[pstate].frequency; 77 return data[pstate].frequency;
79} 78}
80 79
81
82/* Return the vco fid for an input fid 80/* Return the vco fid for an input fid
83 * 81 *
84 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids 82 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +164,6 @@ static void fidvid_msr_init(void)
166 wrmsr(MSR_FIDVID_CTL, lo, hi); 164 wrmsr(MSR_FIDVID_CTL, lo, hi);
167} 165}
168 166
169
170/* write the new fid value along with the other control fields to the msr */ 167/* write the new fid value along with the other control fields to the msr */
171static int write_new_fid(struct powernow_k8_data *data, u32 fid) 168static int write_new_fid(struct powernow_k8_data *data, u32 fid)
172{ 169{
@@ -740,44 +737,63 @@ static int find_psb_table(struct powernow_k8_data *data)
740#ifdef CONFIG_X86_POWERNOW_K8_ACPI 737#ifdef CONFIG_X86_POWERNOW_K8_ACPI
741static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) 738static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
742{ 739{
743 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 740 if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE))
744 return; 741 return;
745 742
746 data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; 743 data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK;
747 data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; 744 data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK;
748 data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; 745 data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
749 data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; 746 data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
750 data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); 747 data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK);
751 data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; 748 data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK;
749}
750
751
752static struct acpi_processor_performance *acpi_perf_data;
753static int preregister_valid;
754
755static int powernow_k8_cpu_preinit_acpi(void)
756{
757 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
758 if (!acpi_perf_data)
759 return -ENODEV;
760
761 if (acpi_processor_preregister_performance(acpi_perf_data))
762 return -ENODEV;
763 else
764 preregister_valid = 1;
765 return 0;
752} 766}
753 767
754static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 768static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
755{ 769{
756 struct cpufreq_frequency_table *powernow_table; 770 struct cpufreq_frequency_table *powernow_table;
757 int ret_val; 771 int ret_val;
772 int cpu = 0;
758 773
759 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 774 data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
775 if (acpi_processor_register_performance(data->acpi_data, data->cpu)) {
760 dprintk("register performance failed: bad ACPI data\n"); 776 dprintk("register performance failed: bad ACPI data\n");
761 return -EIO; 777 return -EIO;
762 } 778 }
763 779
764 /* verify the data contained in the ACPI structures */ 780 /* verify the data contained in the ACPI structures */
765 if (data->acpi_data.state_count <= 1) { 781 if (data->acpi_data->state_count <= 1) {
766 dprintk("No ACPI P-States\n"); 782 dprintk("No ACPI P-States\n");
767 goto err_out; 783 goto err_out;
768 } 784 }
769 785
770 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 786 if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
771 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 787 (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
772 dprintk("Invalid control/status registers (%x - %x)\n", 788 dprintk("Invalid control/status registers (%x - %x)\n",
773 data->acpi_data.control_register.space_id, 789 data->acpi_data->control_register.space_id,
774 data->acpi_data.status_register.space_id); 790 data->acpi_data->status_register.space_id);
775 goto err_out; 791 goto err_out;
776 } 792 }
777 793
778 /* fill in data->powernow_table */ 794 /* fill in data->powernow_table */
779 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) 795 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
780 * (data->acpi_data.state_count + 1)), GFP_KERNEL); 796 * (data->acpi_data->state_count + 1)), GFP_KERNEL);
781 if (!powernow_table) { 797 if (!powernow_table) {
782 dprintk("powernow_table memory alloc failure\n"); 798 dprintk("powernow_table memory alloc failure\n");
783 goto err_out; 799 goto err_out;
@@ -790,12 +806,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
790 if (ret_val) 806 if (ret_val)
791 goto err_out_mem; 807 goto err_out_mem;
792 808
793 powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; 809 powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END;
794 powernow_table[data->acpi_data.state_count].index = 0; 810 powernow_table[data->acpi_data->state_count].index = 0;
795 data->powernow_table = powernow_table; 811 data->powernow_table = powernow_table;
796 812
797 /* fill in data */ 813 /* fill in data */
798 data->numps = data->acpi_data.state_count; 814 data->numps = data->acpi_data->state_count;
799 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) 815 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu)
800 print_basics(data); 816 print_basics(data);
801 powernow_k8_acpi_pst_values(data, 0); 817 powernow_k8_acpi_pst_values(data, 0);
@@ -803,16 +819,31 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
803 /* notify BIOS that we exist */ 819 /* notify BIOS that we exist */
804 acpi_processor_notify_smm(THIS_MODULE); 820 acpi_processor_notify_smm(THIS_MODULE);
805 821
822 /* determine affinity, from ACPI if available */
823 if (preregister_valid) {
824 if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) ||
825 (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY))
826 data->starting_core_affinity = data->acpi_data->shared_cpu_map;
827 else
828 data->starting_core_affinity = cpumask_of_cpu(data->cpu);
829 } else {
830 /* best guess from family if not */
831 if (cpu_family == CPU_HW_PSTATE)
832 data->starting_core_affinity = cpumask_of_cpu(data->cpu);
833 else
834 data->starting_core_affinity = per_cpu(cpu_core_map, data->cpu);
835 }
836
806 return 0; 837 return 0;
807 838
808err_out_mem: 839err_out_mem:
809 kfree(powernow_table); 840 kfree(powernow_table);
810 841
811err_out: 842err_out:
812 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 843 acpi_processor_unregister_performance(data->acpi_data, data->cpu);
813 844
814 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ 845 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
815 data->acpi_data.state_count = 0; 846 data->acpi_data->state_count = 0;
816 847
817 return -ENODEV; 848 return -ENODEV;
818} 849}
@@ -824,10 +855,10 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
824 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 855 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
825 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 856 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
826 857
827 for (i = 0; i < data->acpi_data.state_count; i++) { 858 for (i = 0; i < data->acpi_data->state_count; i++) {
828 u32 index; 859 u32 index;
829 860
830 index = data->acpi_data.states[i].control & HW_PSTATE_MASK; 861 index = data->acpi_data->states[i].control & HW_PSTATE_MASK;
831 if (index > data->max_hw_pstate) { 862 if (index > data->max_hw_pstate) {
832 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); 863 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index);
833 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); 864 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n");
@@ -843,7 +874,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
843 874
844 powernow_table[i].index = index; 875 powernow_table[i].index = index;
845 876
846 powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; 877 powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000;
847 } 878 }
848 return 0; 879 return 0;
849} 880}
@@ -852,16 +883,16 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
852{ 883{
853 int i; 884 int i;
854 int cntlofreq = 0; 885 int cntlofreq = 0;
855 for (i = 0; i < data->acpi_data.state_count; i++) { 886 for (i = 0; i < data->acpi_data->state_count; i++) {
856 u32 fid; 887 u32 fid;
857 u32 vid; 888 u32 vid;
858 889
859 if (data->exttype) { 890 if (data->exttype) {
860 fid = data->acpi_data.states[i].status & EXT_FID_MASK; 891 fid = data->acpi_data->states[i].status & EXT_FID_MASK;
861 vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; 892 vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK;
862 } else { 893 } else {
863 fid = data->acpi_data.states[i].control & FID_MASK; 894 fid = data->acpi_data->states[i].control & FID_MASK;
864 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; 895 vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK;
865 } 896 }
866 897
867 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); 898 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
@@ -902,10 +933,10 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
902 cntlofreq = i; 933 cntlofreq = i;
903 } 934 }
904 935
905 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { 936 if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) {
906 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", 937 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
907 powernow_table[i].frequency, 938 powernow_table[i].frequency,
908 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); 939 (unsigned int) (data->acpi_data->states[i].core_frequency * 1000));
909 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 940 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
910 continue; 941 continue;
911 } 942 }
@@ -915,11 +946,12 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
915 946
916static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) 947static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
917{ 948{
918 if (data->acpi_data.state_count) 949 if (data->acpi_data->state_count)
919 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 950 acpi_processor_unregister_performance(data->acpi_data, data->cpu);
920} 951}
921 952
922#else 953#else
954static int powernow_k8_cpu_preinit_acpi(void) { return -ENODEV; }
923static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } 955static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
924static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } 956static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
925static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } 957static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
@@ -966,7 +998,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 998 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 999 freqs.new = find_khz_freq_from_fid(fid);
968 1000
969 for_each_cpu_mask(i, *(data->available_cores)) { 1001 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 1002 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1003 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 1004 }
@@ -974,7 +1006,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 1006 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 1007 freqs.new = find_khz_freq_from_fid(data->currfid);
976 1008
977 for_each_cpu_mask(i, *(data->available_cores)) { 1009 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 1010 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1011 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 1012 }
@@ -997,7 +1029,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 1029 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1030 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 1031
1000 for_each_cpu_mask(i, *(data->available_cores)) { 1032 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 1033 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1034 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1035 }
@@ -1005,7 +1037,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1037 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1038 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1039
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1040 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1041 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1042 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1043 }
@@ -1104,7 +1136,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
1104static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1136static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1105{ 1137{
1106 struct powernow_k8_data *data; 1138 struct powernow_k8_data *data;
1107 cpumask_t oldmask; 1139 cpumask_t oldmask = CPU_MASK_ALL;
1108 int rc; 1140 int rc;
1109 1141
1110 if (!cpu_online(pol->cpu)) 1142 if (!cpu_online(pol->cpu))
@@ -1177,10 +1209,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1177 /* run on any CPU again */ 1209 /* run on any CPU again */
1178 set_cpus_allowed_ptr(current, &oldmask); 1210 set_cpus_allowed_ptr(current, &oldmask);
1179 1211
1180 if (cpu_family == CPU_HW_PSTATE) 1212 pol->cpus = data->starting_core_affinity;
1181 pol->cpus = cpumask_of_cpu(pol->cpu);
1182 else
1183 pol->cpus = per_cpu(cpu_core_map, pol->cpu);
1184 data->available_cores = &(pol->cpus); 1213 data->available_cores = &(pol->cpus);
1185 1214
1186 /* Take a crude guess here. 1215 /* Take a crude guess here.
@@ -1303,6 +1332,7 @@ static int __cpuinit powernowk8_init(void)
1303 } 1332 }
1304 1333
1305 if (supported_cpus == num_online_cpus()) { 1334 if (supported_cpus == num_online_cpus()) {
1335 powernow_k8_cpu_preinit_acpi();
1306 printk(KERN_INFO PFX "Found %d %s " 1336 printk(KERN_INFO PFX "Found %d %s "
1307 "processors (%d cpu cores) (" VERSION ")\n", 1337 "processors (%d cpu cores) (" VERSION ")\n",
1308 num_online_nodes(), 1338 num_online_nodes(),
@@ -1319,6 +1349,10 @@ static void __exit powernowk8_exit(void)
1319 dprintk("exit\n"); 1349 dprintk("exit\n");
1320 1350
1321 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1351 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1352
1353#ifdef CONFIG_X86_POWERNOW_K8_ACPI
1354 free_percpu(acpi_perf_data);
1355#endif
1322} 1356}
1323 1357
1324MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>"); 1358MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index ab48cfed4d96..a62612cd4be8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -33,12 +33,13 @@ struct powernow_k8_data {
33#ifdef CONFIG_X86_POWERNOW_K8_ACPI 33#ifdef CONFIG_X86_POWERNOW_K8_ACPI
34 /* the acpi table needs to be kept. it's only available if ACPI was 34 /* the acpi table needs to be kept. it's only available if ACPI was
35 * used to determine valid frequency/vid/fid states */ 35 * used to determine valid frequency/vid/fid states */
36 struct acpi_processor_performance acpi_data; 36 struct acpi_processor_performance *acpi_data;
37#endif 37#endif
38 /* we need to keep track of associated cores, but let cpufreq 38 /* we need to keep track of associated cores, but let cpufreq
39 * handle hotplug events - so just point at cpufreq pol->cpus 39 * handle hotplug events - so just point at cpufreq pol->cpus
40 * structure */ 40 * structure */
41 cpumask_t *available_cores; 41 cpumask_t *available_cores;
42 cpumask_t starting_core_affinity;
42}; 43};
43 44
44 45
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..15e13c01cc36 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -28,7 +28,8 @@
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@lists.linux.org.uk"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 358 int i;
348 359
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 360 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 361 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
362 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 363 return -ENODEV;
352 364
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 365 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 373 break;
362 374
363 if (i != N_IDS) 375 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 376 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 377
366 if (!centrino_cpu[policy->cpu]) { 378 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 379 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 380 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 381 MAINTAINER "\n");
@@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 398 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 400 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 403 return -ENODEV;
391 } 404 }
392 } 405 }
393 406
394 freq = get_cur_freq(policy->cpu); 407 freq = get_cur_freq(policy->cpu);
395 408 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 409 /* 10uS transition latency */
397 policy->cur = freq; 410 policy->cur = freq;
398 411
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 412 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 413
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 414 ret = cpufreq_frequency_table_cpuinfo(policy,
415 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 416 if (ret)
403 return (ret); 417 return (ret);
404 418
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 419 cpufreq_frequency_table_get_attr(
420 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 421
407 return 0; 422 return 0;
408} 423}
@@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 426{
412 unsigned int cpu = policy->cpu; 427 unsigned int cpu = policy->cpu;
413 428
414 if (!centrino_model[cpu]) 429 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 430 return -ENODEV;
416 431
417 cpufreq_frequency_table_put_attr(cpu); 432 cpufreq_frequency_table_put_attr(cpu);
418 433
419 centrino_model[cpu] = NULL; 434 per_cpu(centrino_model, cpu) = NULL;
420 435
421 return 0; 436 return 0;
422} 437}
@@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 445 */
431static int centrino_verify (struct cpufreq_policy *policy) 446static int centrino_verify (struct cpufreq_policy *policy)
432{ 447{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 448 return cpufreq_frequency_table_verify(policy,
449 per_cpu(centrino_model, policy->cpu)->op_points);
434} 450}
435 451
436/** 452/**
437 * centrino_setpolicy - set a new CPUFreq policy 453 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 454 * @policy: new policy
439 * @target_freq: the target frequency 455 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 456 * @relation: how that frequency relates to achieved frequency
457 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 458 *
442 * Sets a new CPUFreq policy. 459 * Sets a new CPUFreq policy.
443 */ 460 */
461struct allmasks {
462 cpumask_t online_policy_cpus;
463 cpumask_t saved_mask;
464 cpumask_t set_mask;
465 cpumask_t covered_cpus;
466};
467
444static int centrino_target (struct cpufreq_policy *policy, 468static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 469 unsigned int target_freq,
446 unsigned int relation) 470 unsigned int relation)
@@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 472 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 473 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 474 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 475 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 476 unsigned int j, k, first_cpu, tmp;
457 477 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 478 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 479 CPUMASK_PTR(saved_mask, allmasks);
480 CPUMASK_PTR(set_mask, allmasks);
481 CPUMASK_PTR(covered_cpus, allmasks);
482
483 if (unlikely(allmasks == NULL))
484 return -ENOMEM;
485
486 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
487 retval = -ENODEV;
488 goto out;
489 }
460 490
461 if (unlikely(cpufreq_frequency_table_target(policy, 491 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 492 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 493 target_freq,
464 relation, 494 relation,
465 &newstate))) { 495 &newstate))) {
466 return -EINVAL; 496 retval = -EINVAL;
497 goto out;
467 } 498 }
468 499
469#ifdef CONFIG_HOTPLUG_CPU 500#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 501 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 502 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 503#else
473 online_policy_cpus = policy->cpus; 504 *online_policy_cpus = policy->cpus;
474#endif 505#endif
475 506
476 saved_mask = current->cpus_allowed; 507 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 508 first_cpu = 1;
478 cpus_clear(covered_cpus); 509 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 510 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 511 /*
481 * Support for SMP systems. 512 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 513 * Make sure we are running on CPU that wants to change freq
483 */ 514 */
484 cpus_clear(set_mask); 515 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 516 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 517 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 518 else
488 cpu_set(j, set_mask); 519 cpu_set(j, *set_mask);
489 520
490 set_cpus_allowed_ptr(current, &set_mask); 521 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 522 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 523 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 524 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 525 retval = -EAGAIN;
495 if (first_cpu) { 526 if (first_cpu) {
@@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 531 break;
501 } 532 }
502 533
503 msr = centrino_model[cpu]->op_points[newstate].index; 534 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 535
505 if (first_cpu) { 536 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 537 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 548 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 549 target_freq, freqs.old, freqs.new, msr);
519 550
520 for_each_cpu_mask(k, online_policy_cpus) { 551 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 552 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 553 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 554 CPUFREQ_PRECHANGE);
@@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 567 break;
537 } 568 }
538 569
539 cpu_set(j, covered_cpus); 570 cpu_set(j, *covered_cpus);
540 preempt_enable(); 571 preempt_enable();
541 } 572 }
542 573
543 for_each_cpu_mask(k, online_policy_cpus) { 574 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 575 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 576 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 577 }
@@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 584 * Best effort undo..
554 */ 585 */
555 586
556 if (!cpus_empty(covered_cpus)) { 587 if (!cpus_empty(*covered_cpus))
557 for_each_cpu_mask(j, covered_cpus) { 588 for_each_cpu_mask_nr(j, *covered_cpus) {
558 set_cpus_allowed_ptr(current, 589 set_cpus_allowed_ptr(current,
559 &cpumask_of_cpu(j)); 590 &cpumask_of_cpu(j));
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 591 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 592 }
562 }
563 593
564 tmp = freqs.new; 594 tmp = freqs.new;
565 freqs.new = freqs.old; 595 freqs.new = freqs.old;
566 freqs.old = tmp; 596 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 597 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 598 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 599 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 600 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 601 }
572 } 602 }
573 set_cpus_allowed_ptr(current, &saved_mask); 603 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 604 retval = 0;
605 goto out;
575 606
576migrate_end: 607migrate_end:
577 preempt_enable(); 608 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 609 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 610out:
611 CPUMASK_FREE(allmasks);
612 return retval;
580} 613}
581 614
582static struct freq_attr* centrino_attr[] = { 615static struct freq_attr* centrino_attr[] = {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..191f7263c61d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 279
280 cpus_allowed = current->cpus_allowed; 280 cpus_allowed = current->cpus_allowed;
281 281
282 for_each_cpu_mask(i, policy->cpus) { 282 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 285 }
@@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 292 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 293 set_cpus_allowed_ptr(current, &cpus_allowed);
294 294
295 for_each_cpu_mask(i, policy->cpus) { 295 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 298 }
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ff517f0b8cc4..6b0a10b002f1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -489,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
489 int sibling; 489 int sibling;
490 490
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 491 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 493 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 494 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 495 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..65a339678ece 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
580 char __user *buf = ubuf; 580 char __user *buf = ubuf;
581 int i, err; 581 int i, err;
582 582
583 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
584 if (!cpu_tsc) 584 if (!cpu_tsc)
585 return -ENOMEM; 585 return -ENOMEM;
586 586
@@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
762 762
763/* Why are there no generic functions for this? */ 763/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 764#define ACCESSOR(name, var, start) \
765 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 765 static ssize_t show_ ## name(struct sys_device *s, \
766 struct sysdev_attribute *attr, \
767 char *buf) { \
766 return sprintf(buf, "%lx\n", (unsigned long)var); \ 768 return sprintf(buf, "%lx\n", (unsigned long)var); \
767 } \ 769 } \
768 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 770 static ssize_t set_ ## name(struct sys_device *s, \
771 struct sysdev_attribute *attr, \
772 const char *buf, size_t siz) { \
769 char *end; \ 773 char *end; \
770 unsigned long new = simple_strtoul(buf, &end, 0); \ 774 unsigned long new = simple_strtoul(buf, &end, 0); \
771 if (end == buf) return -EINVAL; \ 775 if (end == buf) return -EINVAL; \
@@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
786ACCESSOR(bank4ctl,bank[4],mce_restart()) 790ACCESSOR(bank4ctl,bank[4],mce_restart())
787ACCESSOR(bank5ctl,bank[5],mce_restart()) 791ACCESSOR(bank5ctl,bank[5],mce_restart())
788 792
789static ssize_t show_trigger(struct sys_device *s, char *buf) 793static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
794 char *buf)
790{ 795{
791 strcpy(buf, trigger); 796 strcpy(buf, trigger);
792 strcat(buf, "\n"); 797 strcat(buf, "\n");
793 return strlen(trigger) + 1; 798 return strlen(trigger) + 1;
794} 799}
795 800
796static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 801static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
802 const char *buf,size_t siz)
797{ 803{
798 char *p; 804 char *p;
799 int len; 805 int len;
@@ -806,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
806} 812}
807 813
808static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 814static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
809ACCESSOR(tolerant,tolerant,) 815static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
810ACCESSOR(check_interval,check_interval,mce_restart()) 816ACCESSOR(check_interval,check_interval,mce_restart())
811static struct sysdev_attribute *mce_attributes[] = { 817static struct sysdev_attribute *mce_attributes[] = {
812 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 818 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
813 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 819 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
814 &attr_tolerant, &attr_check_interval, &attr_trigger, 820 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
815 NULL 821 NULL
816}; 822};
817 823
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..88736cadbaa6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..a26c480b9491 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 if ((*pos) < nr_cpu_ids && cpu_online(*pos))
164 return &cpu_data(*pos); 164 return &cpu_data(*pos);
165 return NULL; 165 return NULL;
166} 166}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2de5fa2bbf77..14b11b3be31c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -141,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu)
141{ 141{
142 struct device *dev; 142 struct device *dev;
143 143
144 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 144 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
145 "cpu%d", cpu); 145 NULL, "cpu%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 146 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 147}
148 148
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cdfd94cc6b14..109792bc7cfa 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,16 @@
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56 56
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h>
59#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
60#define __AUDIT_ARCH_LE 0x40000000
61
62#ifndef CONFIG_AUDITSYSCALL
63#define sysenter_audit syscall_trace_entry
64#define sysexit_audit syscall_exit_work
65#endif
66
57/* 67/*
58 * We use macros for low-level operations which need to be overridden 68 * We use macros for low-level operations which need to be overridden
59 * for paravirtualization. The following will never clobber any registers: 69 * for paravirtualization. The following will never clobber any registers:
@@ -333,7 +343,8 @@ sysenter_past_esp:
333 343
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
336 jnz syscall_trace_entry 346 jnz sysenter_audit
347sysenter_do_call:
337 cmpl $(nr_syscalls), %eax 348 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys 349 jae syscall_badsys
339 call *sys_call_table(,%eax,4) 350 call *sys_call_table(,%eax,4)
@@ -343,7 +354,8 @@ sysenter_past_esp:
343 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
344 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
345 testw $_TIF_ALLWORK_MASK, %cx 356 testw $_TIF_ALLWORK_MASK, %cx
346 jne syscall_exit_work 357 jne sysexit_audit
358sysenter_exit:
347/* if something modifies registers it must also disable sysexit */ 359/* if something modifies registers it must also disable sysexit */
348 movl PT_EIP(%esp), %edx 360 movl PT_EIP(%esp), %edx
349 movl PT_OLDESP(%esp), %ecx 361 movl PT_OLDESP(%esp), %ecx
@@ -351,6 +363,45 @@ sysenter_past_esp:
351 TRACE_IRQS_ON 363 TRACE_IRQS_ON
3521: mov PT_FS(%esp), %fs 3641: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 365 ENABLE_INTERRUPTS_SYSEXIT
366
367#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry
371 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4
373 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
374 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
375 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
376 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
377 movl %eax,%edx /* 2nd arg: syscall number */
378 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
379 call audit_syscall_entry
380 pushl %ebx
381 CFI_ADJUST_CFA_OFFSET 4
382 movl PT_EAX(%esp),%eax /* reload syscall number */
383 jmp sysenter_do_call
384
385sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
387 jne syscall_exit_work
388 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY)
390 movl %eax,%edx /* second arg, syscall return value */
391 cmpl $0,%eax /* is it < 0? */
392 setl %al /* 1 if so, 0 if not */
393 movzbl %al,%eax /* zero-extend that */
394 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
395 call audit_syscall_exit
396 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
400 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit
403#endif
404
354 CFI_ENDPROC 405 CFI_ENDPROC
355.pushsection .fixup,"ax" 406.pushsection .fixup,"ax"
3562: movl $0,PT_FS(%esp) 4072: movl $0,PT_FS(%esp)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8410e26f4183..89434d439605 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,6 +53,12 @@
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55 55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE 0x40000000
61
56 .code64 62 .code64
57 63
58#ifdef CONFIG_FTRACE 64#ifdef CONFIG_FTRACE
@@ -351,6 +357,7 @@ ENTRY(system_call_after_swapgs)
351 GET_THREAD_INFO(%rcx) 357 GET_THREAD_INFO(%rcx)
352 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) 358 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
353 jnz tracesys 359 jnz tracesys
360system_call_fastpath:
354 cmpq $__NR_syscall_max,%rax 361 cmpq $__NR_syscall_max,%rax
355 ja badsys 362 ja badsys
356 movq %r10,%rcx 363 movq %r10,%rcx
@@ -402,16 +409,16 @@ sysret_careful:
402sysret_signal: 409sysret_signal:
403 TRACE_IRQS_ON 410 TRACE_IRQS_ON
404 ENABLE_INTERRUPTS(CLBR_NONE) 411 ENABLE_INTERRUPTS(CLBR_NONE)
405 testl $_TIF_DO_NOTIFY_MASK,%edx 412#ifdef CONFIG_AUDITSYSCALL
406 jz 1f 413 bt $TIF_SYSCALL_AUDIT,%edx
407 414 jc sysret_audit
408 /* Really a signal */ 415#endif
409 /* edx: work flags (arg3) */ 416 /* edx: work flags (arg3) */
410 leaq do_notify_resume(%rip),%rax 417 leaq do_notify_resume(%rip),%rax
411 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 418 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
412 xorl %esi,%esi # oldset -> arg2 419 xorl %esi,%esi # oldset -> arg2
413 call ptregscall_common 420 call ptregscall_common
4141: movl $_TIF_WORK_MASK,%edi 421 movl $_TIF_WORK_MASK,%edi
415 /* Use IRET because user could have changed frame. This 422 /* Use IRET because user could have changed frame. This
416 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 423 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
417 DISABLE_INTERRUPTS(CLBR_NONE) 424 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -422,8 +429,45 @@ badsys:
422 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 429 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
423 jmp ret_from_sys_call 430 jmp ret_from_sys_call
424 431
432#ifdef CONFIG_AUDITSYSCALL
433 /*
434 * Fast path for syscall audit without full syscall trace.
435 * We just call audit_syscall_entry() directly, and then
436 * jump back to the normal fast path.
437 */
438auditsys:
439 movq %r10,%r9 /* 6th arg: 4th syscall arg */
440 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
441 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
442 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
443 movq %rax,%rsi /* 2nd arg: syscall number */
444 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
445 call audit_syscall_entry
446 LOAD_ARGS 0 /* reload call-clobbered registers */
447 jmp system_call_fastpath
448
449 /*
450 * Return fast path for syscall audit. Call audit_syscall_exit()
451 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
452 * masked off.
453 */
454sysret_audit:
455 movq %rax,%rsi /* second arg, syscall return value */
456 cmpq $0,%rax /* is it < 0? */
457 setl %al /* 1 if so, 0 if not */
458 movzbl %al,%edi /* zero-extend that into %edi */
459 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
460 call audit_syscall_exit
461 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
462 jmp sysret_check
463#endif /* CONFIG_AUDITSYSCALL */
464
425 /* Do syscall tracing */ 465 /* Do syscall tracing */
426tracesys: 466tracesys:
467#ifdef CONFIG_AUDITSYSCALL
468 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
469 jz auditsys
470#endif
427 SAVE_REST 471 SAVE_REST
428 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 472 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
429 FIXUP_TOP_OF_STACK %rdi 473 FIXUP_TOP_OF_STACK %rdi
@@ -448,6 +492,7 @@ tracesys:
448 * Has correct top of stack, but partial stack frame. 492 * Has correct top of stack, but partial stack frame.
449 */ 493 */
450 .globl int_ret_from_sys_call 494 .globl int_ret_from_sys_call
495 .globl int_with_check
451int_ret_from_sys_call: 496int_ret_from_sys_call:
452 DISABLE_INTERRUPTS(CLBR_NONE) 497 DISABLE_INTERRUPTS(CLBR_NONE)
453 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd217..eaff0bbb1444 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -99,3 +99,4 @@ int is_uv_system(void)
99{ 99{
100 return uv_system_type != UV_NONE; 100 return uv_system_type != UV_NONE;
101} 101}
102EXPORT_SYMBOL_GPL(is_uv_system);
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..786548a62d38 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
168 * May as well be the first. 168 * May as well be the first.
169 */ 169 */
170 cpu = first_cpu(cpumask); 170 cpu = first_cpu(cpumask);
171 if ((unsigned)cpu < NR_CPUS) 171 if ((unsigned)cpu < nr_cpu_ids)
172 return per_cpu(x86_cpu_to_apicid, cpu); 172 return per_cpu(x86_cpu_to_apicid, cpu);
173 else 173 else
174 return BAD_APICID; 174 return BAD_APICID;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 3c3929340692..2cfcbded888a 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -98,7 +98,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector)
98{ 98{
99 unsigned int cpu; 99 unsigned int cpu;
100 100
101 for (cpu = 0; cpu < NR_CPUS; ++cpu) 101 for_each_possible_cpu(cpu)
102 if (cpu_isset(cpu, mask)) 102 if (cpu_isset(cpu, mask))
103 uv_send_IPI_one(cpu, vector); 103 uv_send_IPI_one(cpu, vector);
104} 104}
@@ -132,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
132 * May as well be the first. 132 * May as well be the first.
133 */ 133 */
134 cpu = first_cpu(cpumask); 134 cpu = first_cpu(cpumask);
135 if ((unsigned)cpu < NR_CPUS) 135 if ((unsigned)cpu < nr_cpu_ids)
136 return per_cpu(x86_cpu_to_apicid, cpu); 136 return per_cpu(x86_cpu_to_apicid, cpu);
137 else 137 else
138 return BAD_APICID; 138 return BAD_APICID;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441caf..a7010c3a377a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP
4561: 4561:
457#endif /* CONFIG_SMP */ 457#endif /* CONFIG_SMP */
458 jmp *(initial_code) 458 jmp *(initial_code)
459.align 4
460ENTRY(initial_code)
461 .long i386_start_kernel
462 459
463/* 460/*
464 * We depend on ET to be correct. This checks for 287/387. 461 * We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +598,11 @@ ignore_int:
601#endif 598#endif
602 iret 599 iret
603 600
601.section .cpuinit.data,"wa"
602.align 4
603ENTRY(initial_code)
604 .long i386_start_kernel
605
604.section .text 606.section .text
605/* 607/*
606 * Real beginning of normal "text" segment 608 * Real beginning of normal "text" segment
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0ea6a19bfdfe..ad2b15a1334d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -468,7 +468,7 @@ void hpet_disable(void)
468#define RTC_NUM_INTS 1 468#define RTC_NUM_INTS 1
469 469
470static unsigned long hpet_rtc_flags; 470static unsigned long hpet_rtc_flags;
471static unsigned long hpet_prev_update_sec; 471static int hpet_prev_update_sec;
472static struct rtc_time hpet_alarm_time; 472static struct rtc_time hpet_alarm_time;
473static unsigned long hpet_pie_count; 473static unsigned long hpet_pie_count;
474static unsigned long hpet_t1_cmp; 474static unsigned long hpet_t1_cmp;
@@ -575,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
575 575
576 hpet_rtc_flags |= bit_mask; 576 hpet_rtc_flags |= bit_mask;
577 577
578 if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
579 hpet_prev_update_sec = -1;
580
578 if (!oldbits) 581 if (!oldbits)
579 hpet_rtc_timer_init(); 582 hpet_rtc_timer_init();
580 583
@@ -652,7 +655,7 @@ static void hpet_rtc_timer_reinit(void)
652 if (hpet_rtc_flags & RTC_PIE) 655 if (hpet_rtc_flags & RTC_PIE)
653 hpet_pie_count += lost_ints; 656 hpet_pie_count += lost_ints;
654 if (printk_ratelimit()) 657 if (printk_ratelimit())
655 printk(KERN_WARNING "rtc: lost %d interrupts\n", 658 printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
656 lost_ints); 659 lost_ints);
657 } 660 }
658} 661}
@@ -670,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
670 673
671 if (hpet_rtc_flags & RTC_UIE && 674 if (hpet_rtc_flags & RTC_UIE &&
672 curr_time.tm_sec != hpet_prev_update_sec) { 675 curr_time.tm_sec != hpet_prev_update_sec) {
673 rtc_int_flag = RTC_UF; 676 if (hpet_prev_update_sec >= 0)
677 rtc_int_flag = RTC_UF;
674 hpet_prev_update_sec = curr_time.tm_sec; 678 hpet_prev_update_sec = curr_time.tm_sec;
675 } 679 }
676 680
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index de9aa0e3a9c5..09cddb57bec4 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -57,7 +57,7 @@ atomic_t irq_mis_count;
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58 58
59static DEFINE_SPINLOCK(ioapic_lock); 59static DEFINE_SPINLOCK(ioapic_lock);
60static DEFINE_SPINLOCK(vector_lock); 60DEFINE_SPINLOCK(vector_lock);
61 61
62int timer_through_8259 __initdata; 62int timer_through_8259 __initdata;
63 63
@@ -1209,10 +1209,6 @@ static int assign_irq_vector(int irq)
1209 return vector; 1209 return vector;
1210} 1210}
1211 1211
1212void setup_vector_irq(int cpu)
1213{
1214}
1215
1216static struct irq_chip ioapic_chip; 1212static struct irq_chip ioapic_chip;
1217 1213
1218#define IOAPIC_AUTO -1 1214#define IOAPIC_AUTO -1
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 64a46affd858..61a83b70c18f 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -101,7 +101,7 @@ int timer_through_8259 __initdata;
101static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 101static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
102 102
103static DEFINE_SPINLOCK(ioapic_lock); 103static DEFINE_SPINLOCK(ioapic_lock);
104DEFINE_SPINLOCK(vector_lock); 104static DEFINE_SPINLOCK(vector_lock);
105 105
106/* 106/*
107 * # of IRQ routing registers 107 * # of IRQ routing registers
@@ -697,6 +697,19 @@ static int pin_2_irq(int idx, int apic, int pin)
697 return irq; 697 return irq;
698} 698}
699 699
700void lock_vector_lock(void)
701{
702 /* Used to the online set of cpus does not change
703 * during assign_irq_vector.
704 */
705 spin_lock(&vector_lock);
706}
707
708void unlock_vector_lock(void)
709{
710 spin_unlock(&vector_lock);
711}
712
700static int __assign_irq_vector(int irq, cpumask_t mask) 713static int __assign_irq_vector(int irq, cpumask_t mask)
701{ 714{
702 /* 715 /*
@@ -732,7 +745,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
732 return 0; 745 return 0;
733 } 746 }
734 747
735 for_each_cpu_mask(cpu, mask) { 748 for_each_cpu_mask_nr(cpu, mask) {
736 cpumask_t domain, new_mask; 749 cpumask_t domain, new_mask;
737 int new_cpu; 750 int new_cpu;
738 int vector, offset; 751 int vector, offset;
@@ -753,7 +766,7 @@ next:
753 continue; 766 continue;
754 if (vector == IA32_SYSCALL_VECTOR) 767 if (vector == IA32_SYSCALL_VECTOR)
755 goto next; 768 goto next;
756 for_each_cpu_mask(new_cpu, new_mask) 769 for_each_cpu_mask_nr(new_cpu, new_mask)
757 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 770 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
758 goto next; 771 goto next;
759 /* Found one! */ 772 /* Found one! */
@@ -763,7 +776,7 @@ next:
763 cfg->move_in_progress = 1; 776 cfg->move_in_progress = 1;
764 cfg->old_domain = cfg->domain; 777 cfg->old_domain = cfg->domain;
765 } 778 }
766 for_each_cpu_mask(new_cpu, new_mask) 779 for_each_cpu_mask_nr(new_cpu, new_mask)
767 per_cpu(vector_irq, new_cpu)[vector] = irq; 780 per_cpu(vector_irq, new_cpu)[vector] = irq;
768 cfg->vector = vector; 781 cfg->vector = vector;
769 cfg->domain = domain; 782 cfg->domain = domain;
@@ -795,14 +808,14 @@ static void __clear_irq_vector(int irq)
795 808
796 vector = cfg->vector; 809 vector = cfg->vector;
797 cpus_and(mask, cfg->domain, cpu_online_map); 810 cpus_and(mask, cfg->domain, cpu_online_map);
798 for_each_cpu_mask(cpu, mask) 811 for_each_cpu_mask_nr(cpu, mask)
799 per_cpu(vector_irq, cpu)[vector] = -1; 812 per_cpu(vector_irq, cpu)[vector] = -1;
800 813
801 cfg->vector = 0; 814 cfg->vector = 0;
802 cpus_clear(cfg->domain); 815 cpus_clear(cfg->domain);
803} 816}
804 817
805static void __setup_vector_irq(int cpu) 818void __setup_vector_irq(int cpu)
806{ 819{
807 /* Initialize vector_irq on a new cpu */ 820 /* Initialize vector_irq on a new cpu */
808 /* This function must be called with vector_lock held */ 821 /* This function must be called with vector_lock held */
@@ -825,14 +838,6 @@ static void __setup_vector_irq(int cpu)
825 } 838 }
826} 839}
827 840
828void setup_vector_irq(int cpu)
829{
830 spin_lock(&vector_lock);
831 __setup_vector_irq(smp_processor_id());
832 spin_unlock(&vector_lock);
833}
834
835
836static struct irq_chip ioapic_chip; 841static struct irq_chip ioapic_chip;
837 842
838static void ioapic_register_intr(int irq, unsigned long trigger) 843static void ioapic_register_intr(int irq, unsigned long trigger)
@@ -1373,12 +1378,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1373static int ioapic_retrigger_irq(unsigned int irq) 1378static int ioapic_retrigger_irq(unsigned int irq)
1374{ 1379{
1375 struct irq_cfg *cfg = &irq_cfg[irq]; 1380 struct irq_cfg *cfg = &irq_cfg[irq];
1376 cpumask_t mask;
1377 unsigned long flags; 1381 unsigned long flags;
1378 1382
1379 spin_lock_irqsave(&vector_lock, flags); 1383 spin_lock_irqsave(&vector_lock, flags);
1380 mask = cpumask_of_cpu(first_cpu(cfg->domain)); 1384 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
1381 send_IPI_mask(mask, cfg->vector);
1382 spin_unlock_irqrestore(&vector_lock, flags); 1385 spin_unlock_irqrestore(&vector_lock, flags);
1383 1386
1384 return 1; 1387 return 1;
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0373e88de95a..1f26fd9ec4f4 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -43,10 +43,11 @@
43 43
44#define BUILD_IRQ(nr) \ 44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \ 45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.p2align\n" \ 46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \ 47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \ 48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt"); 49 "jmp common_interrupt\n" \
50 ".previous");
50 51
51#define BI(x,y) \ 52#define BI(x,y) \
52 BUILD_IRQ(x##y) 53 BUILD_IRQ(x##y)
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 43c019f85f0d..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index c49ff2dc8f83..0ed5f939b905 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -63,12 +63,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
63 63
64 if (reload) { 64 if (reload) {
65#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
66 cpumask_t mask;
67
68 preempt_disable(); 66 preempt_disable();
69 load_LDT(pc); 67 load_LDT(pc);
70 mask = cpumask_of_cpu(smp_processor_id()); 68 if (!cpus_equal(current->mm->cpu_vm_mask,
71 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 69 cpumask_of_cpu(smp_processor_id())))
72 smp_call_function(flush_ldt, current->mm, 1); 70 smp_call_function(flush_ldt, current->mm, 1);
73 preempt_enable(); 71 preempt_enable();
74#else 72#else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..9fe478d98406 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -22,6 +22,7 @@
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/system.h> 24#include <asm/system.h>
25#include <asm/cacheflush.h>
25 26
26#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 27#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
27static u32 kexec_pgd[1024] PAGE_ALIGNED; 28static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -85,10 +86,12 @@ static void load_segments(void)
85 * reboot code buffer to allow us to avoid allocations 86 * reboot code buffer to allow us to avoid allocations
86 * later. 87 * later.
87 * 88 *
88 * Currently nothing. 89 * Make control page executable.
89 */ 90 */
90int machine_kexec_prepare(struct kimage *image) 91int machine_kexec_prepare(struct kimage *image)
91{ 92{
93 if (nx_enabled)
94 set_pages_x(image->control_code_page, 1);
92 return 0; 95 return 0;
93} 96}
94 97
@@ -98,27 +101,48 @@ int machine_kexec_prepare(struct kimage *image)
98 */ 101 */
99void machine_kexec_cleanup(struct kimage *image) 102void machine_kexec_cleanup(struct kimage *image)
100{ 103{
104 if (nx_enabled)
105 set_pages_nx(image->control_code_page, 1);
101} 106}
102 107
103/* 108/*
104 * Do not allocate memory (or fail in any way) in machine_kexec(). 109 * Do not allocate memory (or fail in any way) in machine_kexec().
105 * We are past the point of no return, committed to rebooting now. 110 * We are past the point of no return, committed to rebooting now.
106 */ 111 */
107NORET_TYPE void machine_kexec(struct kimage *image) 112void machine_kexec(struct kimage *image)
108{ 113{
109 unsigned long page_list[PAGES_NR]; 114 unsigned long page_list[PAGES_NR];
110 void *control_page; 115 void *control_page;
116 asmlinkage unsigned long
117 (*relocate_kernel_ptr)(unsigned long indirection_page,
118 unsigned long control_page,
119 unsigned long start_address,
120 unsigned int has_pae,
121 unsigned int preserve_context);
111 122
112 tracer_disable(); 123 tracer_disable();
113 124
114 /* Interrupts aren't acceptable while we reboot */ 125 /* Interrupts aren't acceptable while we reboot */
115 local_irq_disable(); 126 local_irq_disable();
116 127
128 if (image->preserve_context) {
129#ifdef CONFIG_X86_IO_APIC
130 /* We need to put APICs in legacy mode so that we can
131 * get timer interrupts in second kernel. kexec/kdump
132 * paths already have calls to disable_IO_APIC() in
133 * one form or other. kexec jump path also need
134 * one.
135 */
136 disable_IO_APIC();
137#endif
138 }
139
117 control_page = page_address(image->control_code_page); 140 control_page = page_address(image->control_code_page);
118 memcpy(control_page, relocate_kernel, PAGE_SIZE); 141 memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
119 142
143 relocate_kernel_ptr = control_page;
120 page_list[PA_CONTROL_PAGE] = __pa(control_page); 144 page_list[PA_CONTROL_PAGE] = __pa(control_page);
121 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 145 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
122 page_list[PA_PGD] = __pa(kexec_pgd); 146 page_list[PA_PGD] = __pa(kexec_pgd);
123 page_list[VA_PGD] = (unsigned long)kexec_pgd; 147 page_list[VA_PGD] = (unsigned long)kexec_pgd;
124#ifdef CONFIG_X86_PAE 148#ifdef CONFIG_X86_PAE
@@ -131,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
131 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 155 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
132 page_list[PA_PTE_1] = __pa(kexec_pte1); 156 page_list[PA_PTE_1] = __pa(kexec_pte1);
133 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 157 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
158 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
134 159
135 /* The segment registers are funny things, they have both a 160 /* The segment registers are funny things, they have both a
136 * visible and an invisible part. Whenever the visible part is 161 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
149 set_idt(phys_to_virt(0),0); 174 set_idt(phys_to_virt(0),0);
150 175
151 /* now call it */ 176 /* now call it */
152 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 177 image->start = relocate_kernel_ptr((unsigned long)image->head,
153 image->start, cpu_has_pae); 178 (unsigned long)page_list,
179 image->start, cpu_has_pae,
180 image->preserve_context);
154} 181}
155 182
156void arch_crash_save_vmcoreinfo(void) 183void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
181 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
182 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
183 */ 183 */
184NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
185{ 185{
186 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
187 void *control_page; 187 void *control_page;
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 56b933119a04..652fa5c38ebe 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu)
644 mutex_unlock(&microcode_mutex); 644 mutex_unlock(&microcode_mutex);
645} 645}
646 646
647static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) 647static ssize_t reload_store(struct sys_device *dev,
648 struct sysdev_attribute *attr,
649 const char *buf, size_t sz)
648{ 650{
649 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 651 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
650 char *end; 652 char *end;
@@ -655,9 +657,7 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
655 if (end == buf) 657 if (end == buf)
656 return -EINVAL; 658 return -EINVAL;
657 if (val == 1) { 659 if (val == 1) {
658 cpumask_t old; 660 cpumask_t old = current->cpus_allowed;
659
660 old = current->cpus_allowed;
661 661
662 get_online_cpus(); 662 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
@@ -674,14 +674,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
674 return sz; 674 return sz;
675} 675}
676 676
677static ssize_t version_show(struct sys_device *dev, char *buf) 677static ssize_t version_show(struct sys_device *dev,
678 struct sysdev_attribute *attr, char *buf)
678{ 679{
679 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 680 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
680 681
681 return sprintf(buf, "0x%x\n", uci->rev); 682 return sprintf(buf, "0x%x\n", uci->rev);
682} 683}
683 684
684static ssize_t pf_show(struct sys_device *dev, char *buf) 685static ssize_t pf_show(struct sys_device *dev,
686 struct sysdev_attribute *attr, char *buf)
685{ 687{
686 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 688 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
687 689
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 0e867676b5a5..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 6ae005ccaed8..678090508a62 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -83,7 +83,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
83 if (x86_quirks->mpc_oem_bus_info) 83 if (x86_quirks->mpc_oem_bus_info)
84 x86_quirks->mpc_oem_bus_info(m, str); 84 x86_quirks->mpc_oem_bus_info(m, str);
85 else 85 else
86 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); 86 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
87 87
88#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
89 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -154,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
154 154
155static void print_MP_intsrc_info(struct mpc_config_intsrc *m) 155static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
156{ 156{
157 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
158 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
159 m->mpc_irqtype, m->mpc_irqflag & 3, 159 m->mpc_irqtype, m->mpc_irqflag & 3,
160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -163,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
163 163
164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
165{ 165{
166 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 166 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
167 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 167 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
@@ -235,7 +235,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
235 235
236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) 236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
237{ 237{
238 printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," 238 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
240 m->mpc_irqtype, m->mpc_irqflag & 3, 240 m->mpc_irqtype, m->mpc_irqflag & 3,
241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -695,7 +695,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
695 unsigned int *bp = phys_to_virt(base); 695 unsigned int *bp = phys_to_virt(base);
696 struct intel_mp_floating *mpf; 696 struct intel_mp_floating *mpf;
697 697
698 printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); 698 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
699 bp, length);
699 BUILD_BUG_ON(sizeof(*mpf) != 16); 700 BUILD_BUG_ON(sizeof(*mpf) != 16);
700 701
701 while (length > 0) { 702 while (length > 0) {
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a153b3905f60..9fd809552447 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -149,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu)
149{ 149{
150 struct device *dev; 150 struct device *dev;
151 151
152 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 152 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
153 "msr%d", cpu); 153 NULL, "msr%d", cpu);
154 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 154 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
155} 155}
156 156
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 097d8a6797fa..94da4d52d798 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
443#endif /* PAGETABLE_LEVELS >= 3 */ 443#endif /* PAGETABLE_LEVELS >= 3 */
444 444
445 .pte_val = native_pte_val, 445 .pte_val = native_pte_val,
446 .pte_flags = native_pte_val, 446 .pte_flags = native_pte_flags,
447 .pgd_val = native_pgd_val, 447 .pgd_val = native_pgd_val,
448 448
449 .make_pte = native_make_pte, 449 .make_pte = native_make_pte,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 151f2d171f7c..02d19328525d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,6 +37,7 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
40
39#include <asm/iommu.h> 41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -410,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
410 } 414 }
411} 415}
412 416
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 417static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 418 int nelems, int direction)
431{ 419{
@@ -436,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 424 unsigned long entry;
437 int i; 425 int i;
438 426
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 427 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 428 BUG_ON(!sg_page(s));
444 429
@@ -474,7 +459,6 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 459static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 460 size_t size, int direction)
476{ 461{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 462 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 463 unsigned long uaddr;
480 unsigned int npages; 464 unsigned int npages;
@@ -483,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
483 uaddr = (unsigned long)vaddr; 467 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 468 npages = num_dma_pages(uaddr, size);
485 469
486 if (translation_enabled(tbl)) 470 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 471}
493 472
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 473static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 476 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 477 unsigned int npages;
499 478
500 if (!translation_enabled(tbl))
501 return;
502
503 npages = num_dma_pages(dma_handle, size); 479 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 480 iommu_free(tbl, dma_handle, npages);
505} 481}
@@ -522,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
522 goto error; 498 goto error;
523 memset(ret, 0, size); 499 memset(ret, 0, size);
524 500
525 if (translation_enabled(tbl)) { 501 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 502 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 503 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 504 goto free;
529 goto free; 505 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 506 return ret;
536
537free: 507free:
538 free_pages((unsigned long)ret, get_order(size)); 508 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 509 ret = NULL;
@@ -541,7 +511,7 @@ error:
541 return ret; 511 return ret;
542} 512}
543 513
544static const struct dma_mapping_ops calgary_dma_ops = { 514static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 515 .alloc_coherent = calgary_alloc_coherent,
546 .map_single = calgary_map_single, 516 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 517 .unmap_single = calgary_unmap_single,
@@ -830,7 +800,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 800
831 tbl = pci_iommu(dev->bus); 801 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 802 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 803
804 if (is_kdump_kernel())
805 calgary_init_bitmap_from_tce_table(tbl);
806 else
807 tce_free(tbl, 0, tbl->it_size);
834 808
835 if (is_calgary(dev->device)) 809 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 810 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1183,10 @@ static int __init calgary_init(void)
1209 if (ret) 1183 if (ret)
1210 return ret; 1184 return ret;
1211 1185
1186 /* Purely for kdump kernel case */
1187 if (is_kdump_kernel())
1188 get_tce_space_from_tar();
1189
1212 do { 1190 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1191 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1192 if (!dev)
@@ -1230,6 +1208,16 @@ static int __init calgary_init(void)
1230 goto error; 1208 goto error;
1231 } while (1); 1209 } while (1);
1232 1210
1211 dev = NULL;
1212 for_each_pci_dev(dev) {
1213 struct iommu_table *tbl;
1214
1215 tbl = find_iommu_table(&dev->dev);
1216
1217 if (translation_enabled(tbl))
1218 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1219 }
1220
1233 return ret; 1221 return ret;
1234 1222
1235error: 1223error:
@@ -1251,6 +1239,7 @@ error:
1251 calgary_disable_translation(dev); 1239 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1240 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1241 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1242 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1243 } while (1);
1255 1244
1256 return ret; 1245 return ret;
@@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1328 return (val != 0xffffffff);
1340} 1329}
1341 1330
1331/*
1332 * calgary_init_bitmap_from_tce_table():
1333 * Funtion for kdump case. In the second/kdump kernel initialize
1334 * the bitmap based on the tce table entries obtained from first kernel
1335 */
1336static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1337{
1338 u64 *tp;
1339 unsigned int index;
1340 tp = ((u64 *)tbl->it_base);
1341 for (index = 0 ; index < tbl->it_size; index++) {
1342 if (*tp != 0x0)
1343 set_bit(index, tbl->it_map);
1344 tp++;
1345 }
1346}
1347
1348/*
1349 * get_tce_space_from_tar():
1350 * Function for kdump case. Get the tce tables from first kernel
1351 * by reading the contents of the base adress register of calgary iommu
1352 */
1353static void get_tce_space_from_tar(void)
1354{
1355 int bus;
1356 void __iomem *target;
1357 unsigned long tce_space;
1358
1359 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1360 struct calgary_bus_info *info = &bus_info[bus];
1361 unsigned short pci_device;
1362 u32 val;
1363
1364 val = read_pci_config(bus, 0, 0, 0);
1365 pci_device = (val & 0xFFFF0000) >> 16;
1366
1367 if (!is_cal_pci_dev(pci_device))
1368 continue;
1369 if (info->translation_disabled)
1370 continue;
1371
1372 if (calgary_bus_has_devices(bus, pci_device) ||
1373 translate_empty_slots) {
1374 target = calgary_reg(bus_info[bus].bbar,
1375 tar_offset(bus));
1376 tce_space = be64_to_cpu(readq(target));
1377 tce_space = tce_space & TAR_SW_BITS;
1378
1379 tce_space = tce_space & (~specified_table_size);
1380 info->tce_space = (u64 *)__va(tce_space);
1381 }
1382 }
1383 return;
1384}
1385
1342void __init detect_calgary(void) 1386void __init detect_calgary(void)
1343{ 1387{
1344 int bus; 1388 int bus;
@@ -1394,7 +1438,8 @@ void __init detect_calgary(void)
1394 return; 1438 return;
1395 } 1439 }
1396 1440
1397 specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); 1441 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1442 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1443
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1444 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1445 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1457,16 @@ void __init detect_calgary(void)
1412 1457
1413 if (calgary_bus_has_devices(bus, pci_device) || 1458 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1459 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1460 /*
1416 if (!tbl) 1461 * If it is kdump kernel, find and use tce tables
1417 goto cleanup; 1462 * from first kernel, else allocate tce tables here
1418 info->tce_space = tbl; 1463 */
1464 if (!is_kdump_kernel()) {
1465 tbl = alloc_tce_table();
1466 if (!tbl)
1467 goto cleanup;
1468 info->tce_space = tbl;
1469 }
1419 calgary_found = 1; 1470 calgary_found = 1;
1420 } 1471 }
1421 } 1472 }
@@ -1430,6 +1481,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1481 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1482 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1483 debugging ? "enabled" : "disabled");
1484
1485 /* swiotlb for devices that aren't behind the Calgary. */
1486 if (max_pfn > MAX_DMA32_PFN)
1487 swiotlb = 1;
1433 } 1488 }
1434 return; 1489 return;
1435 1490
@@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void)
1446{ 1501{
1447 int ret; 1502 int ret;
1448 1503
1449 if (no_iommu || swiotlb) 1504 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1505 return -ENODEV;
1451 1506
1452 if (!calgary_detected) 1507 if (!calgary_detected)
@@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1514 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1515 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1516 "falling back to no_iommu\n", ret);
1462 if (max_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1517 return ret;
1466 } 1518 }
1467 1519
1468 force_iommu = 1; 1520 force_iommu = 1;
1469 bad_dma_address = 0x0; 1521 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1522 /* dma_ops is set to swiotlb or nommu */
1523 if (!dma_ops)
1524 dma_ops = &nommu_dma_ops;
1471 1525
1472 return 0; 1526 return 0;
1473} 1527}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index a4213c00dffc..87d4d6964ec2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,7 +11,7 @@
11 11
12static int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
13 13
14const struct dma_mapping_ops *dma_ops; 14struct dma_mapping_ops *dma_ops;
15EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
16 16
17static int iommu_sac_force __read_mostly; 17static int iommu_sac_force __read_mostly;
@@ -123,6 +123,14 @@ void __init pci_iommu_alloc(void)
123 123
124 pci_swiotlb_init(); 124 pci_swiotlb_init();
125} 125}
126
127unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
128{
129 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
130
131 return size >> PAGE_SHIFT;
132}
133EXPORT_SYMBOL(iommu_num_pages);
126#endif 134#endif
127 135
128/* 136/*
@@ -192,136 +200,19 @@ static __init int iommu_setup(char *p)
192} 200}
193early_param("iommu", iommu_setup); 201early_param("iommu", iommu_setup);
194 202
195#ifdef CONFIG_X86_32
196int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
197 dma_addr_t device_addr, size_t size, int flags)
198{
199 void __iomem *mem_base = NULL;
200 int pages = size >> PAGE_SHIFT;
201 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
202
203 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
204 goto out;
205 if (!size)
206 goto out;
207 if (dev->dma_mem)
208 goto out;
209
210 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
211
212 mem_base = ioremap(bus_addr, size);
213 if (!mem_base)
214 goto out;
215
216 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
217 if (!dev->dma_mem)
218 goto out;
219 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
220 if (!dev->dma_mem->bitmap)
221 goto free1_out;
222
223 dev->dma_mem->virt_base = mem_base;
224 dev->dma_mem->device_base = device_addr;
225 dev->dma_mem->size = pages;
226 dev->dma_mem->flags = flags;
227
228 if (flags & DMA_MEMORY_MAP)
229 return DMA_MEMORY_MAP;
230
231 return DMA_MEMORY_IO;
232
233 free1_out:
234 kfree(dev->dma_mem);
235 out:
236 if (mem_base)
237 iounmap(mem_base);
238 return 0;
239}
240EXPORT_SYMBOL(dma_declare_coherent_memory);
241
242void dma_release_declared_memory(struct device *dev)
243{
244 struct dma_coherent_mem *mem = dev->dma_mem;
245
246 if (!mem)
247 return;
248 dev->dma_mem = NULL;
249 iounmap(mem->virt_base);
250 kfree(mem->bitmap);
251 kfree(mem);
252}
253EXPORT_SYMBOL(dma_release_declared_memory);
254
255void *dma_mark_declared_memory_occupied(struct device *dev,
256 dma_addr_t device_addr, size_t size)
257{
258 struct dma_coherent_mem *mem = dev->dma_mem;
259 int pos, err;
260 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
261
262 pages >>= PAGE_SHIFT;
263
264 if (!mem)
265 return ERR_PTR(-EINVAL);
266
267 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
268 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
269 if (err != 0)
270 return ERR_PTR(err);
271 return mem->virt_base + (pos << PAGE_SHIFT);
272}
273EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
274
275static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
276 dma_addr_t *dma_handle, void **ret)
277{
278 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
279 int order = get_order(size);
280
281 if (mem) {
282 int page = bitmap_find_free_region(mem->bitmap, mem->size,
283 order);
284 if (page >= 0) {
285 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
286 *ret = mem->virt_base + (page << PAGE_SHIFT);
287 memset(*ret, 0, size);
288 }
289 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
290 *ret = NULL;
291 }
292 return (mem != NULL);
293}
294
295static int dma_release_coherent(struct device *dev, int order, void *vaddr)
296{
297 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
298
299 if (mem && vaddr >= mem->virt_base && vaddr <
300 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
301 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
302
303 bitmap_release_region(mem->bitmap, page, order);
304 return 1;
305 }
306 return 0;
307}
308#else
309#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
310#define dma_release_coherent(dev, order, vaddr) (0)
311#endif /* CONFIG_X86_32 */
312
313int dma_supported(struct device *dev, u64 mask) 203int dma_supported(struct device *dev, u64 mask)
314{ 204{
205 struct dma_mapping_ops *ops = get_dma_ops(dev);
206
315#ifdef CONFIG_PCI 207#ifdef CONFIG_PCI
316 if (mask > 0xffffffff && forbid_dac > 0) { 208 if (mask > 0xffffffff && forbid_dac > 0) {
317 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 209 dev_info(dev, "PCI: Disallowing DAC for device\n");
318 dev->bus_id);
319 return 0; 210 return 0;
320 } 211 }
321#endif 212#endif
322 213
323 if (dma_ops->dma_supported) 214 if (ops->dma_supported)
324 return dma_ops->dma_supported(dev, mask); 215 return ops->dma_supported(dev, mask);
325 216
326 /* Copied from i386. Doesn't make much sense, because it will 217 /* Copied from i386. Doesn't make much sense, because it will
327 only work for pci_alloc_coherent. 218 only work for pci_alloc_coherent.
@@ -342,8 +233,7 @@ int dma_supported(struct device *dev, u64 mask)
342 type. Normally this doesn't make any difference, but gives 233 type. Normally this doesn't make any difference, but gives
343 more gentle handling of IOMMU overflow. */ 234 more gentle handling of IOMMU overflow. */
344 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 235 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
345 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 236 dev_info(dev, "Force SAC with mask %Lx\n", mask);
346 dev->bus_id, mask);
347 return 0; 237 return 0;
348 } 238 }
349 239
@@ -369,6 +259,7 @@ void *
369dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 259dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
370 gfp_t gfp) 260 gfp_t gfp)
371{ 261{
262 struct dma_mapping_ops *ops = get_dma_ops(dev);
372 void *memory = NULL; 263 void *memory = NULL;
373 struct page *page; 264 struct page *page;
374 unsigned long dma_mask = 0; 265 unsigned long dma_mask = 0;
@@ -378,7 +269,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
378 /* ignore region specifiers */ 269 /* ignore region specifiers */
379 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 270 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
380 271
381 if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) 272 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
382 return memory; 273 return memory;
383 274
384 if (!dev) { 275 if (!dev) {
@@ -437,8 +328,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
437 /* Let low level make its own zone decisions */ 328 /* Let low level make its own zone decisions */
438 gfp &= ~(GFP_DMA32|GFP_DMA); 329 gfp &= ~(GFP_DMA32|GFP_DMA);
439 330
440 if (dma_ops->alloc_coherent) 331 if (ops->alloc_coherent)
441 return dma_ops->alloc_coherent(dev, size, 332 return ops->alloc_coherent(dev, size,
442 dma_handle, gfp); 333 dma_handle, gfp);
443 return NULL; 334 return NULL;
444 } 335 }
@@ -450,14 +341,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
450 } 341 }
451 } 342 }
452 343
453 if (dma_ops->alloc_coherent) { 344 if (ops->alloc_coherent) {
454 free_pages((unsigned long)memory, get_order(size)); 345 free_pages((unsigned long)memory, get_order(size));
455 gfp &= ~(GFP_DMA|GFP_DMA32); 346 gfp &= ~(GFP_DMA|GFP_DMA32);
456 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); 347 return ops->alloc_coherent(dev, size, dma_handle, gfp);
457 } 348 }
458 349
459 if (dma_ops->map_simple) { 350 if (ops->map_simple) {
460 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), 351 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
461 size, 352 size,
462 PCI_DMA_BIDIRECTIONAL); 353 PCI_DMA_BIDIRECTIONAL);
463 if (*dma_handle != bad_dma_address) 354 if (*dma_handle != bad_dma_address)
@@ -479,12 +370,14 @@ EXPORT_SYMBOL(dma_alloc_coherent);
479void dma_free_coherent(struct device *dev, size_t size, 370void dma_free_coherent(struct device *dev, size_t size,
480 void *vaddr, dma_addr_t bus) 371 void *vaddr, dma_addr_t bus)
481{ 372{
373 struct dma_mapping_ops *ops = get_dma_ops(dev);
374
482 int order = get_order(size); 375 int order = get_order(size);
483 WARN_ON(irqs_disabled()); /* for portability */ 376 WARN_ON(irqs_disabled()); /* for portability */
484 if (dma_release_coherent(dev, order, vaddr)) 377 if (dma_release_from_coherent(dev, order, vaddr))
485 return; 378 return;
486 if (dma_ops->unmap_single) 379 if (ops->unmap_single)
487 dma_ops->unmap_single(dev, bus, size, 0); 380 ops->unmap_single(dev, bus, size, 0);
488 free_pages((unsigned long)vaddr, order); 381 free_pages((unsigned long)vaddr, order);
489} 382}
490EXPORT_SYMBOL(dma_free_coherent); 383EXPORT_SYMBOL(dma_free_coherent);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index be60961f8695..49285f8fd4d5 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -67,9 +67,6 @@ static u32 gart_unmapped_entry;
67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
69 69
70#define to_pages(addr, size) \
71 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
72
73#define EMERGENCY_PAGES 32 /* = 128KB */ 70#define EMERGENCY_PAGES 32 /* = 128KB */
74 71
75#ifdef CONFIG_AGP 72#ifdef CONFIG_AGP
@@ -198,9 +195,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
198 * out. Hopefully no network devices use single mappings that big. 195 * out. Hopefully no network devices use single mappings that big.
199 */ 196 */
200 197
201 printk(KERN_ERR 198 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
202 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
203 size, dev->bus_id);
204 199
205 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 200 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
206 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 201 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -243,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
243static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 238static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
244 size_t size, int dir) 239 size_t size, int dir)
245{ 240{
246 unsigned long npages = to_pages(phys_mem, size); 241 unsigned long npages = iommu_num_pages(phys_mem, size);
247 unsigned long iommu_page = alloc_iommu(dev, npages); 242 unsigned long iommu_page = alloc_iommu(dev, npages);
248 int i; 243 int i;
249 244
@@ -306,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
306 return; 301 return;
307 302
308 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 303 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
309 npages = to_pages(dma_addr, size); 304 npages = iommu_num_pages(dma_addr, size);
310 for (i = 0; i < npages; i++) { 305 for (i = 0; i < npages; i++) {
311 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 306 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
312 CLEAR_LEAK(iommu_page + i); 307 CLEAR_LEAK(iommu_page + i);
@@ -389,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
389 } 384 }
390 385
391 addr = phys_addr; 386 addr = phys_addr;
392 pages = to_pages(s->offset, s->length); 387 pages = iommu_num_pages(s->offset, s->length);
393 while (pages--) { 388 while (pages--) {
394 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 389 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
395 SET_LEAK(iommu_page); 390 SET_LEAK(iommu_page);
@@ -472,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
472 467
473 seg_size += s->length; 468 seg_size += s->length;
474 need = nextneed; 469 need = nextneed;
475 pages += to_pages(s->offset, s->length); 470 pages += iommu_num_pages(s->offset, s->length);
476 ps = s; 471 ps = s;
477 } 472 }
478 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 473 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -694,8 +689,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
694 689
695extern int agp_amd64_init(void); 690extern int agp_amd64_init(void);
696 691
697static const struct dma_mapping_ops gart_dma_ops = { 692static struct dma_mapping_ops gart_dma_ops = {
698 .mapping_error = NULL,
699 .map_single = gart_map_single, 693 .map_single = gart_map_single,
700 .map_simple = gart_map_simple, 694 .map_simple = gart_map_simple,
701 .unmap_single = gart_unmap_single, 695 .unmap_single = gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 792b9179eff3..3f91f71cdc3e 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75struct dma_mapping_ops nommu_dma_ops = {
76static int nommu_mapping_error(dma_addr_t dma_addr)
77{
78#ifdef CONFIG_X86_32
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83}
84
85
86const struct dma_mapping_ops nommu_dma_ops = {
87 .map_single = nommu_map_single, 76 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 77 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 78 .is_phys = 1,
91}; 79};
92 80
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 20df839b9c20..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7218bccd1766..7b6e44a7c624 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -130,7 +130,7 @@ void cpu_idle(void)
130 130
131 /* endless idle loop with no priority at all */ 131 /* endless idle loop with no priority at all */
132 while (1) { 132 while (1) {
133 tick_nohz_stop_sched_tick(); 133 tick_nohz_stop_sched_tick(1);
134 while (!need_resched()) { 134 while (!need_resched()) {
135 135
136 check_pgt_cache(); 136 check_pgt_cache();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c78090dd0c52..87d7dfdcf46c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,7 +121,7 @@ void cpu_idle(void)
121 current_thread_info()->status |= TS_POLLING; 121 current_thread_info()->status |= TS_POLLING;
122 /* endless idle loop with no priority at all */ 122 /* endless idle loop with no priority at all */
123 while (1) { 123 while (1) {
124 tick_nohz_stop_sched_tick(); 124 tick_nohz_stop_sched_tick(1);
125 while (!need_resched()) { 125 while (!need_resched()) {
126 126
127 rmb(); 127 rmb();
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9dcf39c02972..724adfc63cb9 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -411,10 +411,9 @@ void native_machine_shutdown(void)
411{ 411{
412 /* Stop the cpus and apics */ 412 /* Stop the cpus and apics */
413#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
414 int reboot_cpu_id;
415 414
416 /* The boot cpu is always logical cpu 0 */ 415 /* The boot cpu is always logical cpu 0 */
417 reboot_cpu_id = 0; 416 int reboot_cpu_id = 0;
418 417
419#ifdef CONFIG_X86_32 418#ifdef CONFIG_X86_32
420 /* See if there has been given a command line override */ 419 /* See if there has been given a command line override */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..703310a99023 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
24 * used to save some data for jumping back
25 */
26#define DATA(offset) (PAGE_SIZE/2+(offset))
27
28/* Minimal CPU state */
29#define ESP DATA(0x0)
30#define CR0 DATA(0x4)
31#define CR3 DATA(0x8)
32#define CR4 DATA(0xc)
33
34/* other data */
35#define CP_VA_CONTROL_PAGE DATA(0x10)
36#define CP_PA_PGD DATA(0x14)
37#define CP_PA_SWAP_PAGE DATA(0x18)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
39
23 .text 40 .text
24 .align PAGE_SIZE 41 .align PAGE_SIZE
25 .globl relocate_kernel 42 .globl relocate_kernel
26relocate_kernel: 43relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 44 /* Save the CPU context, used for jumping back */
45
46 pushl %ebx
47 pushl %esi
48 pushl %edi
49 pushl %ebp
50 pushf
51
52 movl 20+8(%esp), %ebp /* list of pages */
53 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
54 movl %esp, ESP(%edi)
55 movl %cr0, %eax
56 movl %eax, CR0(%edi)
57 movl %cr3, %eax
58 movl %eax, CR3(%edi)
59 movl %cr4, %eax
60 movl %eax, CR4(%edi)
28 61
29#ifdef CONFIG_X86_PAE 62#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 63 /* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
138 171
139relocate_new_kernel: 172relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 173 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 174 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 175 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 176 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 177 movl 20+16(%esp), %ecx /* cpu_has_pae */
178 movl 20+20(%esp), %esi /* preserve_context */
145 179
146 /* zero out flags, and disable interrupts */ 180 /* zero out flags, and disable interrupts */
147 pushl $0 181 pushl $0
148 popfl 182 popfl
149 183
184 /* save some information for jumping back */
185 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
186 movl %edi, CP_VA_CONTROL_PAGE(%edi)
187 movl PTR(PA_PGD)(%ebp), %eax
188 movl %eax, CP_PA_PGD(%edi)
189 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
190 movl %eax, CP_PA_SWAP_PAGE(%edi)
191 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
192
150 /* get physical address of control page now */ 193 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 194 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 195 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
197 xorl %eax, %eax 240 xorl %eax, %eax
198 movl %eax, %cr3 241 movl %eax, %cr3
199 242
243 movl CP_PA_SWAP_PAGE(%edi), %eax
244 pushl %eax
245 pushl %ebx
246 call swap_pages
247 addl $8, %esp
248
249 /* To be certain of avoiding problems with self-modifying code
250 * I need to execute a serializing instruction here.
251 * So I flush the TLB, it's handy, and not processor dependent.
252 */
253 xorl %eax, %eax
254 movl %eax, %cr3
255
256 /* set all of the registers to known values */
257 /* leave %esp alone */
258
259 testl %esi, %esi
260 jnz 1f
261 xorl %edi, %edi
262 xorl %eax, %eax
263 xorl %ebx, %ebx
264 xorl %ecx, %ecx
265 xorl %edx, %edx
266 xorl %esi, %esi
267 xorl %ebp, %ebp
268 ret
2691:
270 popl %edx
271 movl CP_PA_SWAP_PAGE(%edi), %esp
272 addl $PAGE_SIZE, %esp
2732:
274 call *%edx
275
276 /* get the re-entry point of the peer system */
277 movl 0(%esp), %ebp
278 call 1f
2791:
280 popl %ebx
281 subl $(1b - relocate_kernel), %ebx
282 movl CP_VA_CONTROL_PAGE(%ebx), %edi
283 lea PAGE_SIZE(%ebx), %esp
284 movl CP_PA_SWAP_PAGE(%ebx), %eax
285 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
286 pushl %eax
287 pushl %edx
288 call swap_pages
289 addl $8, %esp
290 movl CP_PA_PGD(%ebx), %eax
291 movl %eax, %cr3
292 movl %cr0, %eax
293 orl $(1<<31), %eax
294 movl %eax, %cr0
295 lea PAGE_SIZE(%edi), %esp
296 movl %edi, %eax
297 addl $(virtual_mapped - relocate_kernel), %eax
298 pushl %eax
299 ret
300
301virtual_mapped:
302 movl CR4(%edi), %eax
303 movl %eax, %cr4
304 movl CR3(%edi), %eax
305 movl %eax, %cr3
306 movl CR0(%edi), %eax
307 movl %eax, %cr0
308 movl ESP(%edi), %esp
309 movl %ebp, %eax
310
311 popf
312 popl %ebp
313 popl %edi
314 popl %esi
315 popl %ebx
316 ret
317
200 /* Do the copies */ 318 /* Do the copies */
201 movl %ebx, %ecx 319swap_pages:
320 movl 8(%esp), %edx
321 movl 4(%esp), %ecx
322 pushl %ebp
323 pushl %ebx
324 pushl %edi
325 pushl %esi
326 movl %ecx, %ebx
202 jmp 1f 327 jmp 1f
203 328
2040: /* top, read another word from the indirection page */ 3290: /* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 351 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 352 andl $0xfffff000, %esi
228 353
354 movl %edi, %eax
355 movl %esi, %ebp
356
357 movl %edx, %edi
229 movl $1024, %ecx 358 movl $1024, %ecx
230 rep ; movsl 359 rep ; movsl
231 jmp 0b
232 360
2333: 361 movl %ebp, %edi
234 362 movl %eax, %esi
235 /* To be certain of avoiding problems with self-modifying code 363 movl $1024, %ecx
236 * I need to execute a serializing instruction here. 364 rep ; movsl
237 * So I flush the TLB, it's handy, and not processor dependent.
238 */
239 xorl %eax, %eax
240 movl %eax, %cr3
241 365
242 /* set all of the registers to known values */ 366 movl %eax, %edi
243 /* leave %esp alone */ 367 movl %edx, %esi
368 movl $1024, %ecx
369 rep ; movsl
244 370
245 xorl %eax, %eax 371 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 372 jmp 0b
247 xorl %ecx, %ecx 3733:
248 xorl %edx, %edx 374 popl %esi
249 xorl %esi, %esi 375 popl %edi
250 xorl %edi, %edi 376 popl %ebx
251 xorl %ebp, %ebp 377 popl %ebp
252 ret 378 ret
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ec952aa5394a..68b48e3fbcbd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -597,13 +597,21 @@ void __init setup_arch(char **cmdline_p)
597 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 597 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
598 visws_early_detect(); 598 visws_early_detect();
599 pre_setup_arch_hook(); 599 pre_setup_arch_hook();
600 early_cpu_init();
601#else 600#else
602 printk(KERN_INFO "Command line: %s\n", boot_command_line); 601 printk(KERN_INFO "Command line: %s\n", boot_command_line);
603#endif 602#endif
604 603
604 early_cpu_init();
605 early_ioremap_init(); 605 early_ioremap_init();
606 606
607#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
608 /*
609 * Must be before kernel pagetables are setup
610 * or fixmap area is touched.
611 */
612 vmi_init();
613#endif
614
607 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 615 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
608 screen_info = boot_params.screen_info; 616 screen_info = boot_params.screen_info;
609 edid_info = boot_params.edid_info; 617 edid_info = boot_params.edid_info;
@@ -665,9 +673,6 @@ void __init setup_arch(char **cmdline_p)
665 bss_resource.start = virt_to_phys(&__bss_start); 673 bss_resource.start = virt_to_phys(&__bss_start);
666 bss_resource.end = virt_to_phys(&__bss_stop)-1; 674 bss_resource.end = virt_to_phys(&__bss_stop)-1;
667 675
668#ifdef CONFIG_X86_64
669 early_cpu_init();
670#endif
671 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 676 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
672 *cmdline_p = command_line; 677 *cmdline_p = command_line;
673 678
@@ -680,7 +685,7 @@ void __init setup_arch(char **cmdline_p)
680#ifdef CONFIG_X86_LOCAL_APIC 685#ifdef CONFIG_X86_LOCAL_APIC
681 disable_apic = 1; 686 disable_apic = 1;
682#endif 687#endif
683 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 688 setup_clear_cpu_cap(X86_FEATURE_APIC);
684 } 689 }
685 690
686#ifdef CONFIG_PCI 691#ifdef CONFIG_PCI
@@ -791,10 +796,6 @@ void __init setup_arch(char **cmdline_p)
791 796
792 initmem_init(0, max_pfn); 797 initmem_init(0, max_pfn);
793 798
794#ifdef CONFIG_X86_64
795 dma32_reserve_bootmem();
796#endif
797
798#ifdef CONFIG_ACPI_SLEEP 799#ifdef CONFIG_ACPI_SLEEP
799 /* 800 /*
800 * Reserve low memory region for sleep support. 801 * Reserve low memory region for sleep support.
@@ -809,20 +810,21 @@ void __init setup_arch(char **cmdline_p)
809#endif 810#endif
810 reserve_crashkernel(); 811 reserve_crashkernel();
811 812
813#ifdef CONFIG_X86_64
814 /*
815 * dma32_reserve_bootmem() allocates bootmem which may conflict
816 * with the crashkernel command line, so do that after
817 * reserve_crashkernel()
818 */
819 dma32_reserve_bootmem();
820#endif
821
812 reserve_ibft_region(); 822 reserve_ibft_region();
813 823
814#ifdef CONFIG_KVM_CLOCK 824#ifdef CONFIG_KVM_CLOCK
815 kvmclock_init(); 825 kvmclock_init();
816#endif 826#endif
817 827
818#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
819 /*
820 * Must be after max_low_pfn is determined, and before kernel
821 * pagetables are setup.
822 */
823 vmi_init();
824#endif
825
826 paravirt_pagetable_setup_start(swapper_pg_dir); 828 paravirt_pagetable_setup_start(swapper_pg_dir);
827 paging_init(); 829 paging_init();
828 paravirt_pagetable_setup_done(swapper_pg_dir); 830 paravirt_pagetable_setup_done(swapper_pg_dir);
@@ -859,12 +861,6 @@ void __init setup_arch(char **cmdline_p)
859 init_apic_mappings(); 861 init_apic_mappings();
860 ioapic_init_mappings(); 862 ioapic_init_mappings();
861 863
862#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
863 if (def_to_bigsmp)
864 printk(KERN_WARNING "More than 8 CPUs detected and "
865 "CONFIG_X86_PC cannot handle it.\nUse "
866 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
867#endif
868 kvm_guest_init(); 864 kvm_guest_init();
869 865
870 e820_reserve_resources(); 866 e820_reserve_resources();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index f7745f94c006..76e305e064f9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void)
80#endif 80#endif
81} 81}
82 82
83#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
84cpumask_t *cpumask_of_cpu_map __read_mostly;
85EXPORT_SYMBOL(cpumask_of_cpu_map);
86
87/* requires nr_cpu_ids to be initialized */
88static void __init setup_cpumask_of_cpu(void)
89{
90 int i;
91
92 /* alloc_bootmem zeroes memory */
93 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
94 for (i = 0; i < nr_cpu_ids; i++)
95 cpu_set(i, cpumask_of_cpu_map[i]);
96}
97#else
98static inline void setup_cpumask_of_cpu(void) { }
99#endif
100
101#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
102/* 84/*
103 * Great future not-so-futuristic plan: make i386 and x86_64 do it 85 * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -197,9 +179,6 @@ void __init setup_per_cpu_areas(void)
197 179
198 /* Setup node to cpumask map */ 180 /* Setup node to cpumask map */
199 setup_node_to_cpumask_map(); 181 setup_node_to_cpumask_map();
200
201 /* Setup cpumask_of_cpu map */
202 setup_cpumask_of_cpu();
203} 182}
204 183
205#endif 184#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 5cede1045ce7..0c727f64e79b 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -662,8 +662,5 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
662 if (thread_info_flags & _TIF_SIGPENDING) 662 if (thread_info_flags & _TIF_SIGPENDING)
663 do_signal(regs); 663 do_signal(regs);
664 664
665 if (thread_info_flags & _TIF_HRTICK_RESCHED)
666 hrtick_resched();
667
668 clear_thread_flag(TIF_IRET); 665 clear_thread_flag(TIF_IRET);
669} 666}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index b95a0a609053..02b02583f5b5 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -54,6 +54,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
54 return do_sigaltstack(uss, uoss, regs->sp); 54 return do_sigaltstack(uss, uoss, regs->sp);
55} 55}
56 56
57/*
58 * Signal frame handlers.
59 */
60
61static inline int save_i387(struct _fpstate __user *buf)
62{
63 struct task_struct *tsk = current;
64 int err = 0;
65
66 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
67 sizeof(tsk->thread.xstate->fxsave));
68
69 if ((unsigned long)buf % 16)
70 printk("save_i387: bad fpstate %p\n", buf);
71
72 if (!used_math())
73 return 0;
74 clear_used_math(); /* trigger finit */
75 if (task_thread_info(tsk)->status & TS_USEDFPU) {
76 err = save_i387_checking((struct i387_fxsave_struct __user *)
77 buf);
78 if (err)
79 return err;
80 task_thread_info(tsk)->status &= ~TS_USEDFPU;
81 stts();
82 } else {
83 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
84 sizeof(struct i387_fxsave_struct)))
85 return -1;
86 }
87 return 1;
88}
89
90/*
91 * This restores directly out of user space. Exceptions are handled.
92 */
93static inline int restore_i387(struct _fpstate __user *buf)
94{
95 struct task_struct *tsk = current;
96 int err;
97
98 if (!used_math()) {
99 err = init_fpu(tsk);
100 if (err)
101 return err;
102 }
103
104 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
105 clts();
106 task_thread_info(current)->status |= TS_USEDFPU;
107 }
108 return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
109}
57 110
58/* 111/*
59 * Do a signal return; undo the signal stack. 112 * Do a signal return; undo the signal stack.
@@ -497,9 +550,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
497 /* deal with pending signal delivery */ 550 /* deal with pending signal delivery */
498 if (thread_info_flags & _TIF_SIGPENDING) 551 if (thread_info_flags & _TIF_SIGPENDING)
499 do_signal(regs); 552 do_signal(regs);
500
501 if (thread_info_flags & _TIF_HRTICK_RESCHED)
502 hrtick_resched();
503} 553}
504 554
505void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 555void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a9331a42f76f..6112c3632345 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -326,12 +326,16 @@ static void __cpuinit start_secondary(void *unused)
326 * for which cpus receive the IPI. Holding this 326 * for which cpus receive the IPI. Holding this
327 * lock helps us to not include this cpu in a currently in progress 327 * lock helps us to not include this cpu in a currently in progress
328 * smp_call_function(). 328 * smp_call_function().
329 *
330 * We need to hold vector_lock so there the set of online cpus
331 * does not change while we are assigning vectors to cpus. Holding
332 * this lock ensures we don't half assign or remove an irq from a cpu.
329 */ 333 */
330 ipi_call_lock_irq(); 334 ipi_call_lock_irq();
331#ifdef CONFIG_X86_IO_APIC 335 lock_vector_lock();
332 setup_vector_irq(smp_processor_id()); 336 __setup_vector_irq(smp_processor_id());
333#endif
334 cpu_set(smp_processor_id(), cpu_online_map); 337 cpu_set(smp_processor_id(), cpu_online_map);
338 unlock_vector_lock();
335 ipi_call_unlock_irq(); 339 ipi_call_unlock_irq();
336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 340 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
337 341
@@ -438,7 +442,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
438 cpu_set(cpu, cpu_sibling_setup_map); 442 cpu_set(cpu, cpu_sibling_setup_map);
439 443
440 if (smp_num_siblings > 1) { 444 if (smp_num_siblings > 1) {
441 for_each_cpu_mask(i, cpu_sibling_setup_map) { 445 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 446 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
443 c->cpu_core_id == cpu_data(i).cpu_core_id) { 447 c->cpu_core_id == cpu_data(i).cpu_core_id) {
444 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 448 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -461,7 +465,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
461 return; 465 return;
462 } 466 }
463 467
464 for_each_cpu_mask(i, cpu_sibling_setup_map) { 468 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 469 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 470 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
467 cpu_set(i, c->llc_shared_map); 471 cpu_set(i, c->llc_shared_map);
@@ -990,7 +994,17 @@ int __cpuinit native_cpu_up(unsigned int cpu)
990 flush_tlb_all(); 994 flush_tlb_all();
991 low_mappings = 1; 995 low_mappings = 1;
992 996
997#ifdef CONFIG_X86_PC
998 if (def_to_bigsmp && apicid > 8) {
999 printk(KERN_WARNING
1000 "More than 8 CPUs detected - skipping them.\n"
1001 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
1002 err = -1;
1003 } else
1004 err = do_boot_cpu(apicid, cpu);
1005#else
993 err = do_boot_cpu(apicid, cpu); 1006 err = do_boot_cpu(apicid, cpu);
1007#endif
994 1008
995 zap_low_mappings(); 1009 zap_low_mappings();
996 low_mappings = 0; 1010 low_mappings = 0;
@@ -1219,7 +1233,7 @@ static void remove_siblinginfo(int cpu)
1219 int sibling; 1233 int sibling;
1220 struct cpuinfo_x86 *c = &cpu_data(cpu); 1234 struct cpuinfo_x86 *c = &cpu_data(cpu);
1221 1235
1222 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { 1236 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1223 cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); 1237 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1224 /*/ 1238 /*/
1225 * last thread sibling in this cpu core going down 1239 * last thread sibling in this cpu core going down
@@ -1228,7 +1242,7 @@ static void remove_siblinginfo(int cpu)
1228 cpu_data(sibling).booted_cores--; 1242 cpu_data(sibling).booted_cores--;
1229 } 1243 }
1230 1244
1231 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) 1245 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1232 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); 1246 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1233 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1247 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1234 cpus_clear(per_cpu(cpu_core_map, cpu)); 1248 cpus_clear(per_cpu(cpu_core_map, cpu));
@@ -1336,7 +1350,9 @@ int __cpu_disable(void)
1336 remove_siblinginfo(cpu); 1350 remove_siblinginfo(cpu);
1337 1351
1338 /* It's now safe to remove this processor from the online map */ 1352 /* It's now safe to remove this processor from the online map */
1353 lock_vector_lock();
1339 remove_cpu_from_maps(cpu); 1354 remove_cpu_from_maps(cpu);
1355 unlock_vector_lock();
1340 fixup_irqs(cpu_online_map); 1356 fixup_irqs(cpu_online_map);
1341 return 0; 1357 return 0;
1342} 1358}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 0a1b1a9d922d..6ca515d6db54 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
37#include <asm/timer.h> 37#include <asm/timer.h>
38#include <asm/vmi_time.h> 38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
40#include <asm/setup.h>
40 41
41/* Convenient for calling VMI functions indirectly in the ROM */ 42/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); 43typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
683{ 684{
684 /* We must establish the lowmem mapping for MMU ops to work */ 685 /* We must establish the lowmem mapping for MMU ops to work */
685 if (vmi_ops.set_linear_mapping) 686 if (vmi_ops.set_linear_mapping)
686 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); 687 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
687} 688}
688 689
689/* 690/*
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fabc5f3b..ce3251ce5504 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select MMU_NOTIFIER
24 select ANON_INODES 25 select ANON_INODES
25 ---help--- 26 ---help---
26 Support hosting fully virtualized guest machines using hardware 27 Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b0e4ddca6c18..0bfe2bd305eb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
653 account_shadowed(kvm, gfn); 653 account_shadowed(kvm, gfn);
654} 654}
655 655
656static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
657{
658 u64 *spte;
659 int need_tlb_flush = 0;
660
661 while ((spte = rmap_next(kvm, rmapp, NULL))) {
662 BUG_ON(!(*spte & PT_PRESENT_MASK));
663 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
664 rmap_remove(kvm, spte);
665 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
666 need_tlb_flush = 1;
667 }
668 return need_tlb_flush;
669}
670
671static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
672 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
673{
674 int i;
675 int retval = 0;
676
677 /*
678 * If mmap_sem isn't taken, we can look the memslots with only
679 * the mmu_lock by skipping over the slots with userspace_addr == 0.
680 */
681 for (i = 0; i < kvm->nmemslots; i++) {
682 struct kvm_memory_slot *memslot = &kvm->memslots[i];
683 unsigned long start = memslot->userspace_addr;
684 unsigned long end;
685
686 /* mmu_lock protects userspace_addr */
687 if (!start)
688 continue;
689
690 end = start + (memslot->npages << PAGE_SHIFT);
691 if (hva >= start && hva < end) {
692 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
693 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
694 retval |= handler(kvm,
695 &memslot->lpage_info[
696 gfn_offset /
697 KVM_PAGES_PER_HPAGE].rmap_pde);
698 }
699 }
700
701 return retval;
702}
703
704int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
705{
706 return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
707}
708
709static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
710{
711 u64 *spte;
712 int young = 0;
713
714 spte = rmap_next(kvm, rmapp, NULL);
715 while (spte) {
716 int _young;
717 u64 _spte = *spte;
718 BUG_ON(!(_spte & PT_PRESENT_MASK));
719 _young = _spte & PT_ACCESSED_MASK;
720 if (_young) {
721 young = 1;
722 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
723 }
724 spte = rmap_next(kvm, rmapp, spte);
725 }
726 return young;
727}
728
729int kvm_age_hva(struct kvm *kvm, unsigned long hva)
730{
731 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
732}
733
656#ifdef MMU_DEBUG 734#ifdef MMU_DEBUG
657static int is_empty_shadow_page(u64 *spt) 735static int is_empty_shadow_page(u64 *spt)
658{ 736{
@@ -1203,6 +1281,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1203 int r; 1281 int r;
1204 int largepage = 0; 1282 int largepage = 0;
1205 pfn_t pfn; 1283 pfn_t pfn;
1284 unsigned long mmu_seq;
1206 1285
1207 down_read(&current->mm->mmap_sem); 1286 down_read(&current->mm->mmap_sem);
1208 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1287 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1210,6 +1289,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1210 largepage = 1; 1289 largepage = 1;
1211 } 1290 }
1212 1291
1292 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1293 /* implicit mb(), we'll read before PT lock is unlocked */
1213 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1294 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1214 up_read(&current->mm->mmap_sem); 1295 up_read(&current->mm->mmap_sem);
1215 1296
@@ -1220,6 +1301,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1220 } 1301 }
1221 1302
1222 spin_lock(&vcpu->kvm->mmu_lock); 1303 spin_lock(&vcpu->kvm->mmu_lock);
1304 if (mmu_notifier_retry(vcpu, mmu_seq))
1305 goto out_unlock;
1223 kvm_mmu_free_some_pages(vcpu); 1306 kvm_mmu_free_some_pages(vcpu);
1224 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1307 r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
1225 PT32E_ROOT_LEVEL); 1308 PT32E_ROOT_LEVEL);
@@ -1227,6 +1310,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1227 1310
1228 1311
1229 return r; 1312 return r;
1313
1314out_unlock:
1315 spin_unlock(&vcpu->kvm->mmu_lock);
1316 kvm_release_pfn_clean(pfn);
1317 return 0;
1230} 1318}
1231 1319
1232 1320
@@ -1345,6 +1433,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1345 int r; 1433 int r;
1346 int largepage = 0; 1434 int largepage = 0;
1347 gfn_t gfn = gpa >> PAGE_SHIFT; 1435 gfn_t gfn = gpa >> PAGE_SHIFT;
1436 unsigned long mmu_seq;
1348 1437
1349 ASSERT(vcpu); 1438 ASSERT(vcpu);
1350 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 1439 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1358,6 +1447,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1358 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1447 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1359 largepage = 1; 1448 largepage = 1;
1360 } 1449 }
1450 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1451 /* implicit mb(), we'll read before PT lock is unlocked */
1361 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1452 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1362 up_read(&current->mm->mmap_sem); 1453 up_read(&current->mm->mmap_sem);
1363 if (is_error_pfn(pfn)) { 1454 if (is_error_pfn(pfn)) {
@@ -1365,12 +1456,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1365 return 1; 1456 return 1;
1366 } 1457 }
1367 spin_lock(&vcpu->kvm->mmu_lock); 1458 spin_lock(&vcpu->kvm->mmu_lock);
1459 if (mmu_notifier_retry(vcpu, mmu_seq))
1460 goto out_unlock;
1368 kvm_mmu_free_some_pages(vcpu); 1461 kvm_mmu_free_some_pages(vcpu);
1369 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1462 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1370 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1463 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
1371 spin_unlock(&vcpu->kvm->mmu_lock); 1464 spin_unlock(&vcpu->kvm->mmu_lock);
1372 1465
1373 return r; 1466 return r;
1467
1468out_unlock:
1469 spin_unlock(&vcpu->kvm->mmu_lock);
1470 kvm_release_pfn_clean(pfn);
1471 return 0;
1374} 1472}
1375 1473
1376static void nonpaging_free(struct kvm_vcpu *vcpu) 1474static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1670,6 +1768,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1670 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1768 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1671 vcpu->arch.update_pte.largepage = 1; 1769 vcpu->arch.update_pte.largepage = 1;
1672 } 1770 }
1771 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1772 /* implicit mb(), we'll read before PT lock is unlocked */
1673 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1773 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1674 up_read(&current->mm->mmap_sem); 1774 up_read(&current->mm->mmap_sem);
1675 1775
@@ -1814,6 +1914,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1814 spin_unlock(&vcpu->kvm->mmu_lock); 1914 spin_unlock(&vcpu->kvm->mmu_lock);
1815 return r; 1915 return r;
1816} 1916}
1917EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
1817 1918
1818void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 1919void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1819{ 1920{
@@ -1870,6 +1971,12 @@ void kvm_enable_tdp(void)
1870} 1971}
1871EXPORT_SYMBOL_GPL(kvm_enable_tdp); 1972EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1872 1973
1974void kvm_disable_tdp(void)
1975{
1976 tdp_enabled = false;
1977}
1978EXPORT_SYMBOL_GPL(kvm_disable_tdp);
1979
1873static void free_mmu_pages(struct kvm_vcpu *vcpu) 1980static void free_mmu_pages(struct kvm_vcpu *vcpu)
1874{ 1981{
1875 struct kvm_mmu_page *sp; 1982 struct kvm_mmu_page *sp;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4d918220baeb..f72ac1fa35f0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 pfn = vcpu->arch.update_pte.pfn; 263 pfn = vcpu->arch.update_pte.pfn;
264 if (is_error_pfn(pfn)) 264 if (is_error_pfn(pfn))
265 return; 265 return;
266 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
267 return;
266 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
267 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
268 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 270 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 int r; 382 int r;
381 pfn_t pfn; 383 pfn_t pfn;
382 int largepage = 0; 384 int largepage = 0;
385 unsigned long mmu_seq;
383 386
384 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 387 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
385 kvm_mmu_audit(vcpu, "pre page fault"); 388 kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
413 largepage = 1; 416 largepage = 1;
414 } 417 }
415 } 418 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */
416 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
417 up_read(&current->mm->mmap_sem); 422 up_read(&current->mm->mmap_sem);
418 423
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
424 } 429 }
425 430
426 spin_lock(&vcpu->kvm->mmu_lock); 431 spin_lock(&vcpu->kvm->mmu_lock);
432 if (mmu_notifier_retry(vcpu, mmu_seq))
433 goto out_unlock;
427 kvm_mmu_free_some_pages(vcpu); 434 kvm_mmu_free_some_pages(vcpu);
428 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 435 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
429 largepage, &write_pt, pfn); 436 largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 spin_unlock(&vcpu->kvm->mmu_lock); 446 spin_unlock(&vcpu->kvm->mmu_lock);
440 447
441 return write_pt; 448 return write_pt;
449
450out_unlock:
451 spin_unlock(&vcpu->kvm->mmu_lock);
452 kvm_release_pfn_clean(pfn);
453 return 0;
442} 454}
443 455
444static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b756e876dce3..e2ee264740c7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -453,7 +453,8 @@ static __init int svm_hardware_setup(void)
453 if (npt_enabled) { 453 if (npt_enabled) {
454 printk(KERN_INFO "kvm: Nested Paging enabled\n"); 454 printk(KERN_INFO "kvm: Nested Paging enabled\n");
455 kvm_enable_tdp(); 455 kvm_enable_tdp();
456 } 456 } else
457 kvm_disable_tdp();
457 458
458 return 0; 459 return 0;
459 460
@@ -1007,10 +1008,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1007 struct kvm *kvm = svm->vcpu.kvm; 1008 struct kvm *kvm = svm->vcpu.kvm;
1008 u64 fault_address; 1009 u64 fault_address;
1009 u32 error_code; 1010 u32 error_code;
1011 bool event_injection = false;
1010 1012
1011 if (!irqchip_in_kernel(kvm) && 1013 if (!irqchip_in_kernel(kvm) &&
1012 is_external_interrupt(exit_int_info)) 1014 is_external_interrupt(exit_int_info)) {
1015 event_injection = true;
1013 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 1016 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1017 }
1014 1018
1015 fault_address = svm->vmcb->control.exit_info_2; 1019 fault_address = svm->vmcb->control.exit_info_2;
1016 error_code = svm->vmcb->control.exit_info_1; 1020 error_code = svm->vmcb->control.exit_info_1;
@@ -1024,6 +1028,8 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1024 (u32)fault_address, (u32)(fault_address >> 32), 1028 (u32)fault_address, (u32)(fault_address >> 32),
1025 handler); 1029 handler);
1026 1030
1031 if (event_injection)
1032 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1027 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1033 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1028} 1034}
1029 1035
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0cac63701719..2a69773e3b26 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2298,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2298 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2298 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2300 (u32)((u64)cr2 >> 32), handler); 2300 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2301 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2303 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2302 } 2304 }
2303 2305
@@ -3116,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3116 return ERR_PTR(-ENOMEM); 3118 return ERR_PTR(-ENOMEM);
3117 3119
3118 allocate_vpid(vmx); 3120 allocate_vpid(vmx);
3119 if (id == 0 && vm_need_ept()) {
3120 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3121 VMX_EPT_WRITABLE_MASK |
3122 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3123 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
3124 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3125 VMX_EPT_EXECUTABLE_MASK);
3126 kvm_enable_tdp();
3127 }
3128 3121
3129 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 3122 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3130 if (err) 3123 if (err)
@@ -3303,8 +3296,17 @@ static int __init vmx_init(void)
3303 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3296 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
3304 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3297 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
3305 3298
3306 if (cpu_has_vmx_ept()) 3299 if (vm_need_ept()) {
3307 bypass_guest_pf = 0; 3300 bypass_guest_pf = 0;
3301 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3302 VMX_EPT_WRITABLE_MASK |
3303 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3304 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
3305 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3306 VMX_EPT_EXECUTABLE_MASK);
3307 kvm_enable_tdp();
3308 } else
3309 kvm_disable_tdp();
3308 3310
3309 if (bypass_guest_pf) 3311 if (bypass_guest_pf)
3310 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 3312 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9f1cdb011cff..0d682fc6aeb3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -883,6 +883,7 @@ int kvm_dev_ioctl_check_extension(long ext)
883 case KVM_CAP_PIT: 883 case KVM_CAP_PIT:
884 case KVM_CAP_NOP_IO_DELAY: 884 case KVM_CAP_NOP_IO_DELAY:
885 case KVM_CAP_MP_STATE: 885 case KVM_CAP_MP_STATE:
886 case KVM_CAP_SYNC_MMU:
886 r = 1; 887 r = 1;
887 break; 888 break;
888 case KVM_CAP_COALESCED_MMIO: 889 case KVM_CAP_COALESCED_MMIO:
@@ -1495,6 +1496,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1495 goto out; 1496 goto out;
1496 1497
1497 down_write(&kvm->slots_lock); 1498 down_write(&kvm->slots_lock);
1499 spin_lock(&kvm->mmu_lock);
1498 1500
1499 p = &kvm->arch.aliases[alias->slot]; 1501 p = &kvm->arch.aliases[alias->slot];
1500 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1502 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1506,6 +1508,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1506 break; 1508 break;
1507 kvm->arch.naliases = n; 1509 kvm->arch.naliases = n;
1508 1510
1511 spin_unlock(&kvm->mmu_lock);
1509 kvm_mmu_zap_all(kvm); 1512 kvm_mmu_zap_all(kvm);
1510 1513
1511 up_write(&kvm->slots_lock); 1514 up_write(&kvm->slots_lock);
@@ -3184,6 +3187,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3184 kvm_desct->base |= seg_desc->base2 << 24; 3187 kvm_desct->base |= seg_desc->base2 << 24;
3185 kvm_desct->limit = seg_desc->limit0; 3188 kvm_desct->limit = seg_desc->limit0;
3186 kvm_desct->limit |= seg_desc->limit << 16; 3189 kvm_desct->limit |= seg_desc->limit << 16;
3190 if (seg_desc->g) {
3191 kvm_desct->limit <<= 12;
3192 kvm_desct->limit |= 0xfff;
3193 }
3187 kvm_desct->selector = selector; 3194 kvm_desct->selector = selector;
3188 kvm_desct->type = seg_desc->type; 3195 kvm_desct->type = seg_desc->type;
3189 kvm_desct->present = seg_desc->p; 3196 kvm_desct->present = seg_desc->p;
@@ -3223,6 +3230,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3223static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3230static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3224 struct desc_struct *seg_desc) 3231 struct desc_struct *seg_desc)
3225{ 3232{
3233 gpa_t gpa;
3226 struct descriptor_table dtable; 3234 struct descriptor_table dtable;
3227 u16 index = selector >> 3; 3235 u16 index = selector >> 3;
3228 3236
@@ -3232,13 +3240,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3232 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3240 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3233 return 1; 3241 return 1;
3234 } 3242 }
3235 return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3243 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3244 gpa += index * 8;
3245 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3236} 3246}
3237 3247
3238/* allowed just for 8 bytes segments */ 3248/* allowed just for 8 bytes segments */
3239static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3249static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3240 struct desc_struct *seg_desc) 3250 struct desc_struct *seg_desc)
3241{ 3251{
3252 gpa_t gpa;
3242 struct descriptor_table dtable; 3253 struct descriptor_table dtable;
3243 u16 index = selector >> 3; 3254 u16 index = selector >> 3;
3244 3255
@@ -3246,7 +3257,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3246 3257
3247 if (dtable.limit < index * 8 + 7) 3258 if (dtable.limit < index * 8 + 7)
3248 return 1; 3259 return 1;
3249 return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3260 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3261 gpa += index * 8;
3262 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3250} 3263}
3251 3264
3252static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3265static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
@@ -3258,55 +3271,7 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3258 base_addr |= (seg_desc->base1 << 16); 3271 base_addr |= (seg_desc->base1 << 16);
3259 base_addr |= (seg_desc->base2 << 24); 3272 base_addr |= (seg_desc->base2 << 24);
3260 3273
3261 return base_addr; 3274 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3262}
3263
3264static int load_tss_segment32(struct kvm_vcpu *vcpu,
3265 struct desc_struct *seg_desc,
3266 struct tss_segment_32 *tss)
3267{
3268 u32 base_addr;
3269
3270 base_addr = get_tss_base_addr(vcpu, seg_desc);
3271
3272 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3273 sizeof(struct tss_segment_32));
3274}
3275
3276static int save_tss_segment32(struct kvm_vcpu *vcpu,
3277 struct desc_struct *seg_desc,
3278 struct tss_segment_32 *tss)
3279{
3280 u32 base_addr;
3281
3282 base_addr = get_tss_base_addr(vcpu, seg_desc);
3283
3284 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3285 sizeof(struct tss_segment_32));
3286}
3287
3288static int load_tss_segment16(struct kvm_vcpu *vcpu,
3289 struct desc_struct *seg_desc,
3290 struct tss_segment_16 *tss)
3291{
3292 u32 base_addr;
3293
3294 base_addr = get_tss_base_addr(vcpu, seg_desc);
3295
3296 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3297 sizeof(struct tss_segment_16));
3298}
3299
3300static int save_tss_segment16(struct kvm_vcpu *vcpu,
3301 struct desc_struct *seg_desc,
3302 struct tss_segment_16 *tss)
3303{
3304 u32 base_addr;
3305
3306 base_addr = get_tss_base_addr(vcpu, seg_desc);
3307
3308 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3309 sizeof(struct tss_segment_16));
3310} 3275}
3311 3276
3312static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3277static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -3466,20 +3431,26 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3466} 3431}
3467 3432
3468static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3433static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3469 struct desc_struct *cseg_desc, 3434 u32 old_tss_base,
3470 struct desc_struct *nseg_desc) 3435 struct desc_struct *nseg_desc)
3471{ 3436{
3472 struct tss_segment_16 tss_segment_16; 3437 struct tss_segment_16 tss_segment_16;
3473 int ret = 0; 3438 int ret = 0;
3474 3439
3475 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) 3440 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3441 sizeof tss_segment_16))
3476 goto out; 3442 goto out;
3477 3443
3478 save_state_to_tss16(vcpu, &tss_segment_16); 3444 save_state_to_tss16(vcpu, &tss_segment_16);
3479 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3480 3445
3481 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) 3446 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3447 sizeof tss_segment_16))
3448 goto out;
3449
3450 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3451 &tss_segment_16, sizeof tss_segment_16))
3482 goto out; 3452 goto out;
3453
3483 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3454 if (load_state_from_tss16(vcpu, &tss_segment_16))
3484 goto out; 3455 goto out;
3485 3456
@@ -3489,20 +3460,26 @@ out:
3489} 3460}
3490 3461
3491static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3462static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3492 struct desc_struct *cseg_desc, 3463 u32 old_tss_base,
3493 struct desc_struct *nseg_desc) 3464 struct desc_struct *nseg_desc)
3494{ 3465{
3495 struct tss_segment_32 tss_segment_32; 3466 struct tss_segment_32 tss_segment_32;
3496 int ret = 0; 3467 int ret = 0;
3497 3468
3498 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) 3469 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3470 sizeof tss_segment_32))
3499 goto out; 3471 goto out;
3500 3472
3501 save_state_to_tss32(vcpu, &tss_segment_32); 3473 save_state_to_tss32(vcpu, &tss_segment_32);
3502 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3503 3474
3504 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) 3475 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3476 sizeof tss_segment_32))
3505 goto out; 3477 goto out;
3478
3479 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3480 &tss_segment_32, sizeof tss_segment_32))
3481 goto out;
3482
3506 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3483 if (load_state_from_tss32(vcpu, &tss_segment_32))
3507 goto out; 3484 goto out;
3508 3485
@@ -3517,16 +3494,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3517 struct desc_struct cseg_desc; 3494 struct desc_struct cseg_desc;
3518 struct desc_struct nseg_desc; 3495 struct desc_struct nseg_desc;
3519 int ret = 0; 3496 int ret = 0;
3497 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3498 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3520 3499
3521 kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3500 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3522 3501
3502 /* FIXME: Handle errors. Failure to read either TSS or their
3503 * descriptors should generate a pagefault.
3504 */
3523 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3505 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3524 goto out; 3506 goto out;
3525 3507
3526 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) 3508 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3527 goto out; 3509 goto out;
3528 3510
3529
3530 if (reason != TASK_SWITCH_IRET) { 3511 if (reason != TASK_SWITCH_IRET) {
3531 int cpl; 3512 int cpl;
3532 3513
@@ -3544,8 +3525,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3544 3525
3545 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3526 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3546 cseg_desc.type &= ~(1 << 1); //clear the B flag 3527 cseg_desc.type &= ~(1 << 1); //clear the B flag
3547 save_guest_segment_descriptor(vcpu, tr_seg.selector, 3528 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3548 &cseg_desc);
3549 } 3529 }
3550 3530
3551 if (reason == TASK_SWITCH_IRET) { 3531 if (reason == TASK_SWITCH_IRET) {
@@ -3557,10 +3537,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3557 kvm_x86_ops->cache_regs(vcpu); 3537 kvm_x86_ops->cache_regs(vcpu);
3558 3538
3559 if (nseg_desc.type & 8) 3539 if (nseg_desc.type & 8)
3560 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, 3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3561 &nseg_desc); 3541 &nseg_desc);
3562 else 3542 else
3563 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, 3543 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3564 &nseg_desc); 3544 &nseg_desc);
3565 3545
3566 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3546 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
@@ -3995,16 +3975,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3995 */ 3975 */
3996 if (!user_alloc) { 3976 if (!user_alloc) {
3997 if (npages && !old.rmap) { 3977 if (npages && !old.rmap) {
3978 unsigned long userspace_addr;
3979
3998 down_write(&current->mm->mmap_sem); 3980 down_write(&current->mm->mmap_sem);
3999 memslot->userspace_addr = do_mmap(NULL, 0, 3981 userspace_addr = do_mmap(NULL, 0,
4000 npages * PAGE_SIZE, 3982 npages * PAGE_SIZE,
4001 PROT_READ | PROT_WRITE, 3983 PROT_READ | PROT_WRITE,
4002 MAP_SHARED | MAP_ANONYMOUS, 3984 MAP_SHARED | MAP_ANONYMOUS,
4003 0); 3985 0);
4004 up_write(&current->mm->mmap_sem); 3986 up_write(&current->mm->mmap_sem);
4005 3987
4006 if (IS_ERR((void *)memslot->userspace_addr)) 3988 if (IS_ERR((void *)userspace_addr))
4007 return PTR_ERR((void *)memslot->userspace_addr); 3989 return PTR_ERR((void *)userspace_addr);
3990
3991 /* set userspace_addr atomically for kvm_hva_to_rmapp */
3992 spin_lock(&kvm->mmu_lock);
3993 memslot->userspace_addr = userspace_addr;
3994 spin_unlock(&kvm->mmu_lock);
4008 } else { 3995 } else {
4009 if (!old.user_alloc && old.rmap) { 3996 if (!old.user_alloc && old.rmap) {
4010 int ret; 3997 int ret;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 0313a5eec412..d9249a882aa5 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1014,6 +1014,9 @@ __init void lguest_init(void)
1014 init_pg_tables_start = __pa(pg0); 1014 init_pg_tables_start = __pa(pg0);
1015 init_pg_tables_end = __pa(pg0); 1015 init_pg_tables_end = __pa(pg0);
1016 1016
1017 /* As described in head_32.S, we map the first 128M of memory. */
1018 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1019
1017 /* Load the %fs segment register (the per-cpu segment register) with 1020 /* Load the %fs segment register (the per-cpu segment register) with
1018 * the normal data segment to get through booting. */ 1021 * the normal data segment to get through booting. */
1019 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1022 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dfdf428975c0..f118c110af32 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -52,7 +52,7 @@
52 jnz 100b 52 jnz 100b
53102: 53102:
54 .section .fixup,"ax" 54 .section .fixup,"ax"
55103: addl %r8d,%edx /* ecx is zerorest also */ 55103: addl %ecx,%edx /* ecx is zerorest also */
56 jmp copy_user_handle_tail 56 jmp copy_user_handle_tail
57 .previous 57 .previous
58 58
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 40e0e309d27e..cb0c112386fb 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -32,7 +32,7 @@
32 jnz 100b 32 jnz 100b
33102: 33102:
34 .section .fixup,"ax" 34 .section .fixup,"ax"
35103: addl %r8d,%edx /* ecx is zerorest also */ 35103: addl %ecx,%edx /* ecx is zerorest also */
36 jmp copy_user_handle_tail 36 jmp copy_user_handle_tail
37 .previous 37 .previous
38 38
@@ -108,7 +108,6 @@ ENTRY(__copy_user_nocache)
108 jmp 60f 108 jmp 60f
10950: movl %ecx,%edx 10950: movl %ecx,%edx
11060: sfence 11060: sfence
111 movl %r8d,%ecx
112 jmp copy_user_handle_tail 111 jmp copy_user_handle_tail
113 .previous 112 .previous
114 113
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
index 4354ce804889..50189af14b85 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -130,10 +130,10 @@ parse_unisys_oem (char *oemptr)
130 mip_addr = val; 130 mip_addr = val;
131 mip = (struct mip_reg *)val; 131 mip = (struct mip_reg *)val;
132 mip_reg = __va(mip); 132 mip_reg = __va(mip);
133 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", 133 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
134 (unsigned long)host_reg); 134 (unsigned long)host_reg);
135 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", 135 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
136 (unsigned long)mip_reg); 136 (unsigned long)mip_reg);
137 success++; 137 success++;
138 break; 138 break;
139 case MIP_PSAI_REG: 139 case MIP_PSAI_REG:
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7a..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_32) += pgtable_32.o 4obj-$(CONFIG_X86_32) += pgtable_32.o
5 5
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 5dfef9fa061a..62fa440678d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -42,7 +42,6 @@
42 42
43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
45static bootmem_data_t node0_bdata;
46 45
47/* 46/*
48 * numa interface - we expect the numa architecture specific code to have 47 * numa interface - we expect the numa architecture specific code to have
@@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn,
385 for_each_online_node(nid) 384 for_each_online_node(nid)
386 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 385 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
387 386
388 NODE_DATA(0)->bdata = &node0_bdata; 387 NODE_DATA(0)->bdata = &bootmem_node_data[0];
389 setup_bootmem_allocator(); 388 setup_bootmem_allocator();
390} 389}
391 390
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0bb0caed8971..a20d1fa64b4e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_MASK); 151 prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
152 cur = pgprot_val(st->current_prot) & ~(PTE_MASK); 152 cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
221 for (i = 0; i < PTRS_PER_PMD; i++) { 221 for (i = 0; i < PTRS_PER_PMD; i++) {
222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
223 if (!pmd_none(*start)) { 223 if (!pmd_none(*start)) {
224 pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; 224 pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
225 225
226 if (pmd_large(*start) || !pmd_present(*start)) 226 if (pmd_large(*start) || !pmd_present(*start))
227 note_page(m, st, __pgprot(prot), 3); 227 note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
253 for (i = 0; i < PTRS_PER_PUD; i++) { 253 for (i = 0; i < PTRS_PER_PUD; i++) {
254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
255 if (!pud_none(*start)) { 255 if (!pud_none(*start)) {
256 pgprotval_t prot = pud_val(*start) & ~PTE_MASK; 256 pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
257 257
258 if (pud_large(*start) || !pud_present(*start)) 258 if (pud_large(*start) || !pud_present(*start))
259 note_page(m, st, __pgprot(prot), 2); 259 note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
288 for (i = 0; i < PTRS_PER_PGD; i++) { 288 for (i = 0; i < PTRS_PER_PGD; i++) {
289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
290 if (!pgd_none(*start)) { 290 if (!pgd_none(*start)) {
291 pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; 291 pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
292 292
293 if (pgd_large(*start) || !pgd_present(*start)) 293 if (pgd_large(*start) || !pgd_present(*start))
294 note_page(m, &st, __pgprot(prot), 1); 294 note_page(m, &st, __pgprot(prot), 1);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atoimcally,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long addr, len, end;
227 unsigned long next;
228 pgd_t *pgdp;
229 int nr = 0;
230
231 start &= PAGE_MASK;
232 addr = start;
233 len = (unsigned long) nr_pages << PAGE_SHIFT;
234 end = start + len;
235 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
236 start, len)))
237 goto slow_irqon;
238
239 /*
240 * XXX: batch / limit 'nr', to avoid large irq off latency
241 * needs some instrumenting to determine the common sizes used by
242 * important workloads (eg. DB2), and whether limiting the batch size
243 * will decrease performance.
244 *
245 * It seems like we're in the clear for the moment. Direct-IO is
246 * the main guy that batches up lots of get_user_pages, and even
247 * they are limited to 64-at-a-time which is not so many.
248 */
249 /*
250 * This doesn't prevent pagetable teardown, but does prevent
251 * the pagetables and pages from being freed on x86.
252 *
253 * So long as we atomically load page table pointers versus teardown
254 * (which we do on x86, with the above PAE exception), we can follow the
255 * address down to the the page and take a ref on it.
256 */
257 local_irq_disable();
258 pgdp = pgd_offset(mm, addr);
259 do {
260 pgd_t pgd = *pgdp;
261
262 next = pgd_addr_end(addr, end);
263 if (pgd_none(pgd))
264 goto slow;
265 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
266 goto slow;
267 } while (pgdp++, addr = next, addr != end);
268 local_irq_enable();
269
270 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
271 return nr;
272
273 {
274 int ret;
275
276slow:
277 local_irq_enable();
278slow_irqon:
279 /* Try to get the remaining pages with get_user_pages */
280 start += nr << PAGE_SHIFT;
281 pages += nr;
282
283 down_read(&mm->mmap_sem);
284 ret = get_user_pages(current, mm, start,
285 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
286 up_read(&mm->mmap_sem);
287
288 /* Have to be a bit careful with return values */
289 if (nr > 0) {
290 if (ret < 0)
291 ret = nr;
292 else
293 ret += nr;
294 }
295
296 return ret;
297 }
298}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
124 return 1; 124 return 1;
125} 125}
126 126
127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm,
128 unsigned long addr, unsigned long sz)
128{ 129{
129 pgd_t *pgd; 130 pgd_t *pgd;
130 pud_t *pud; 131 pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
133 pgd = pgd_offset(mm, addr); 134 pgd = pgd_offset(mm, addr);
134 pud = pud_alloc(mm, pgd, addr); 135 pud = pud_alloc(mm, pgd, addr);
135 if (pud) { 136 if (pud) {
136 if (pud_none(*pud)) 137 if (sz == PUD_SIZE) {
137 huge_pmd_share(mm, addr, pud); 138 pte = (pte_t *)pud;
138 pte = (pte_t *) pmd_alloc(mm, pud, addr); 139 } else {
140 BUG_ON(sz != PMD_SIZE);
141 if (pud_none(*pud))
142 huge_pmd_share(mm, addr, pud);
143 pte = (pte_t *) pmd_alloc(mm, pud, addr);
144 }
139 } 145 }
140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 146 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
141 147
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
151 pgd = pgd_offset(mm, addr); 157 pgd = pgd_offset(mm, addr);
152 if (pgd_present(*pgd)) { 158 if (pgd_present(*pgd)) {
153 pud = pud_offset(pgd, addr); 159 pud = pud_offset(pgd, addr);
154 if (pud_present(*pud)) 160 if (pud_present(*pud)) {
161 if (pud_large(*pud))
162 return (pte_t *)pud;
155 pmd = pmd_offset(pud, addr); 163 pmd = pmd_offset(pud, addr);
164 }
156 } 165 }
157 return (pte_t *) pmd; 166 return (pte_t *) pmd;
158} 167}
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
188 return 0; 197 return 0;
189} 198}
190 199
200int pud_huge(pud_t pud)
201{
202 return 0;
203}
204
191struct page * 205struct page *
192follow_huge_pmd(struct mm_struct *mm, unsigned long address, 206follow_huge_pmd(struct mm_struct *mm, unsigned long address,
193 pmd_t *pmd, int write) 207 pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
208 return !!(pmd_val(pmd) & _PAGE_PSE); 222 return !!(pmd_val(pmd) & _PAGE_PSE);
209} 223}
210 224
225int pud_huge(pud_t pud)
226{
227 return !!(pud_val(pud) & _PAGE_PSE);
228}
229
211struct page * 230struct page *
212follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231follow_huge_pmd(struct mm_struct *mm, unsigned long address,
213 pmd_t *pmd, int write) 232 pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
216 235
217 page = pte_page(*(pte_t *)pmd); 236 page = pte_page(*(pte_t *)pmd);
218 if (page) 237 if (page)
219 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); 238 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
239 return page;
240}
241
242struct page *
243follow_huge_pud(struct mm_struct *mm, unsigned long address,
244 pud_t *pud, int write)
245{
246 struct page *page;
247
248 page = pte_page(*(pte_t *)pud);
249 if (page)
250 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
220 return page; 251 return page;
221} 252}
253
222#endif 254#endif
223 255
224/* x86_64 also uses this file */ 256/* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
228 unsigned long addr, unsigned long len, 260 unsigned long addr, unsigned long len,
229 unsigned long pgoff, unsigned long flags) 261 unsigned long pgoff, unsigned long flags)
230{ 262{
263 struct hstate *h = hstate_file(file);
231 struct mm_struct *mm = current->mm; 264 struct mm_struct *mm = current->mm;
232 struct vm_area_struct *vma; 265 struct vm_area_struct *vma;
233 unsigned long start_addr; 266 unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
240 } 273 }
241 274
242full_search: 275full_search:
243 addr = ALIGN(start_addr, HPAGE_SIZE); 276 addr = ALIGN(start_addr, huge_page_size(h));
244 277
245 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 278 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
246 /* At this point: (!vma || addr < vma->vm_end). */ 279 /* At this point: (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
262 } 295 }
263 if (addr + mm->cached_hole_size < vma->vm_start) 296 if (addr + mm->cached_hole_size < vma->vm_start)
264 mm->cached_hole_size = vma->vm_start - addr; 297 mm->cached_hole_size = vma->vm_start - addr;
265 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 298 addr = ALIGN(vma->vm_end, huge_page_size(h));
266 } 299 }
267} 300}
268 301
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
270 unsigned long addr0, unsigned long len, 303 unsigned long addr0, unsigned long len,
271 unsigned long pgoff, unsigned long flags) 304 unsigned long pgoff, unsigned long flags)
272{ 305{
306 struct hstate *h = hstate_file(file);
273 struct mm_struct *mm = current->mm; 307 struct mm_struct *mm = current->mm;
274 struct vm_area_struct *vma, *prev_vma; 308 struct vm_area_struct *vma, *prev_vma;
275 unsigned long base = mm->mmap_base, addr = addr0; 309 unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
290 goto fail; 324 goto fail;
291 325
292 /* either no address requested or cant fit in requested address hole */ 326 /* either no address requested or cant fit in requested address hole */
293 addr = (mm->free_area_cache - len) & HPAGE_MASK; 327 addr = (mm->free_area_cache - len) & huge_page_mask(h);
294 do { 328 do {
295 /* 329 /*
296 * Lookup failure means no vma is above this address, 330 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
321 largest_hole = vma->vm_start - addr; 355 largest_hole = vma->vm_start - addr;
322 356
323 /* try just below the current vma->vm_start */ 357 /* try just below the current vma->vm_start */
324 addr = (vma->vm_start - len) & HPAGE_MASK; 358 addr = (vma->vm_start - len) & huge_page_mask(h);
325 } while (len <= vma->vm_start); 359 } while (len <= vma->vm_start);
326 360
327fail: 361fail:
@@ -359,22 +393,23 @@ unsigned long
359hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 393hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
360 unsigned long len, unsigned long pgoff, unsigned long flags) 394 unsigned long len, unsigned long pgoff, unsigned long flags)
361{ 395{
396 struct hstate *h = hstate_file(file);
362 struct mm_struct *mm = current->mm; 397 struct mm_struct *mm = current->mm;
363 struct vm_area_struct *vma; 398 struct vm_area_struct *vma;
364 399
365 if (len & ~HPAGE_MASK) 400 if (len & ~huge_page_mask(h))
366 return -EINVAL; 401 return -EINVAL;
367 if (len > TASK_SIZE) 402 if (len > TASK_SIZE)
368 return -ENOMEM; 403 return -ENOMEM;
369 404
370 if (flags & MAP_FIXED) { 405 if (flags & MAP_FIXED) {
371 if (prepare_hugepage_range(addr, len)) 406 if (prepare_hugepage_range(file, addr, len))
372 return -EINVAL; 407 return -EINVAL;
373 return addr; 408 return addr;
374 } 409 }
375 410
376 if (addr) { 411 if (addr) {
377 addr = ALIGN(addr, HPAGE_SIZE); 412 addr = ALIGN(addr, huge_page_size(h));
378 vma = find_vma(mm, addr); 413 vma = find_vma(mm, addr);
379 if (TASK_SIZE - len >= addr && 414 if (TASK_SIZE - len >= addr &&
380 (!vma || addr + len <= vma->vm_start)) 415 (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
390 425
391#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ 426#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
392 427
428#ifdef CONFIG_X86_64
429static __init int setup_hugepagesz(char *opt)
430{
431 unsigned long ps = memparse(opt, &opt);
432 if (ps == PMD_SIZE) {
433 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
434 } else if (ps == PUD_SIZE && cpu_has_gbpages) {
435 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
436 } else {
437 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
438 ps >> 20);
439 return 0;
440 }
441 return 1;
442}
443__setup("hugepagesz=", setup_hugepagesz);
444#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec37121f6709..129618ca0ea2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -86,43 +86,6 @@ early_param("gbpages", parse_direct_gbpages_on);
86 * around without checking the pgd every time. 86 * around without checking the pgd every time.
87 */ 87 */
88 88
89void show_mem(void)
90{
91 long i, total = 0, reserved = 0;
92 long shared = 0, cached = 0;
93 struct page *page;
94 pg_data_t *pgdat;
95
96 printk(KERN_INFO "Mem-info:\n");
97 show_free_areas();
98 for_each_online_pgdat(pgdat) {
99 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
100 /*
101 * This loop can take a while with 256 GB and
102 * 4k pages so defer the NMI watchdog:
103 */
104 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
105 touch_nmi_watchdog();
106
107 if (!pfn_valid(pgdat->node_start_pfn + i))
108 continue;
109
110 page = pfn_to_page(pgdat->node_start_pfn + i);
111 total++;
112 if (PageReserved(page))
113 reserved++;
114 else if (PageSwapCache(page))
115 cached++;
116 else if (page_count(page))
117 shared += page_count(page) - 1;
118 }
119 }
120 printk(KERN_INFO "%lu pages of RAM\n", total);
121 printk(KERN_INFO "%lu reserved pages\n", reserved);
122 printk(KERN_INFO "%lu pages shared\n", shared);
123 printk(KERN_INFO "%lu pages swap cached\n", cached);
124}
125
126int after_bootmem; 89int after_bootmem;
127 90
128static __init void *spp_getpage(void) 91static __init void *spp_getpage(void)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 19fd9a3c5210..fba57be9af68 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -330,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
330 return (void __iomem *)ret; 330 return (void __iomem *)ret;
331} 331}
332 332
333void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
334 unsigned long prot_val)
335{
336 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
337 __builtin_return_address(0));
338}
339EXPORT_SYMBOL(ioremap_prot);
340
333/** 341/**
334 * iounmap - Free a IO remapping 342 * iounmap - Free a IO remapping
335 * @addr: virtual address from ioremap_* 343 * @addr: virtual address from ioremap_*
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9782f42dd319..a4dd793d6003 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -23,8 +23,6 @@
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
25 25
26static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28struct memnode memnode; 26struct memnode memnode;
29 27
30s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
@@ -198,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
198 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
199 197
200 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
201 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
202 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
203 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
204 202
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 557b2abceef8..d50302774fe2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -207,6 +207,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
207 unsigned long addr; 207 unsigned long addr;
208 int i; 208 int i;
209 209
210 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
211 return;
212
210 pud = pud_offset(pgd, 0); 213 pud = pud_offset(pgd, 0);
211 214
212 for (addr = i = 0; i < PREALLOCATED_PMDS; 215 for (addr = i = 0; i < PREALLOCATED_PMDS;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c570..cab0abbd1ebe 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 7f3329b55d2e..3f90289410e6 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -369,20 +369,34 @@ static int __init ppro_init(char **cpu_type)
369{ 369{
370 __u8 cpu_model = boot_cpu_data.x86_model; 370 __u8 cpu_model = boot_cpu_data.x86_model;
371 371
372 if (cpu_model == 14) 372 switch (cpu_model) {
373 case 0 ... 2:
374 *cpu_type = "i386/ppro";
375 break;
376 case 3 ... 5:
377 *cpu_type = "i386/pii";
378 break;
379 case 6 ... 8:
380 *cpu_type = "i386/piii";
381 break;
382 case 9:
383 *cpu_type = "i386/p6_mobile";
384 break;
385 case 10 ... 13:
386 *cpu_type = "i386/p6";
387 break;
388 case 14:
373 *cpu_type = "i386/core"; 389 *cpu_type = "i386/core";
374 else if (cpu_model == 15 || cpu_model == 23) 390 break;
391 case 15: case 23:
392 *cpu_type = "i386/core_2";
393 break;
394 case 26:
375 *cpu_type = "i386/core_2"; 395 *cpu_type = "i386/core_2";
376 else if (cpu_model > 0xd) 396 break;
397 default:
398 /* Unknown */
377 return 0; 399 return 0;
378 else if (cpu_model == 9) {
379 *cpu_type = "i386/p6_mobile";
380 } else if (cpu_model > 5) {
381 *cpu_type = "i386/piii";
382 } else if (cpu_model > 2) {
383 *cpu_type = "i386/pii";
384 } else {
385 *cpu_type = "i386/ppro";
386 } 400 }
387 401
388 model = &op_ppro_spec; 402 model = &op_ppro_spec;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ff3a6a336342..4bdaa590375d 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
23 pci_read_config_byte(d, reg++, &busno); 23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba); 24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb); 25 pci_read_config_byte(d, reg++, &subb);
26 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 26 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
27 suba, subb);
27 if (busno) 28 if (busno)
28 pci_scan_bus_with_sysdata(busno); /* Bus A */ 29 pci_scan_bus_with_sysdata(busno); /* Bus A */
29 if (suba < subb) 30 if (suba < subb)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 2aafb67dc5f1..5807d1bc73f7 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -128,10 +128,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
128 pr = pci_find_parent_resource(dev, r); 128 pr = pci_find_parent_resource(dev, r);
129 if (!r->start || !pr || 129 if (!r->start || !pr ||
130 request_resource(pr, r) < 0) { 130 request_resource(pr, r) < 0) {
131 printk(KERN_ERR "PCI: Cannot allocate " 131 dev_err(&dev->dev, "BAR %d: can't "
132 "resource region %d " 132 "allocate resource\n", idx);
133 "of bridge %s\n",
134 idx, pci_name(dev));
135 /* 133 /*
136 * Something is wrong with the region. 134 * Something is wrong with the region.
137 * Invalidate the resource to prevent 135 * Invalidate the resource to prevent
@@ -166,15 +164,15 @@ static void __init pcibios_allocate_resources(int pass)
166 else 164 else
167 disabled = !(command & PCI_COMMAND_MEMORY); 165 disabled = !(command & PCI_COMMAND_MEMORY);
168 if (pass == disabled) { 166 if (pass == disabled) {
169 DBG("PCI: Resource %08lx-%08lx " 167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx "
170 "(f=%lx, d=%d, p=%d)\n", 168 "(f=%lx, d=%d, p=%d)\n",
171 r->start, r->end, r->flags, disabled, pass); 169 (unsigned long long) r->start,
170 (unsigned long long) r->end,
171 r->flags, disabled, pass);
172 pr = pci_find_parent_resource(dev, r); 172 pr = pci_find_parent_resource(dev, r);
173 if (!pr || request_resource(pr, r) < 0) { 173 if (!pr || request_resource(pr, r) < 0) {
174 printk(KERN_ERR "PCI: Cannot allocate " 174 dev_err(&dev->dev, "BAR %d: can't "
175 "resource region %d " 175 "allocate resource\n", idx);
176 "of device %s\n",
177 idx, pci_name(dev));
178 /* We'll assign a new address later */ 176 /* We'll assign a new address later */
179 r->end -= r->start; 177 r->end -= r->start;
180 r->start = 0; 178 r->start = 0;
@@ -187,8 +185,7 @@ static void __init pcibios_allocate_resources(int pass)
187 /* Turn the ROM off, leave the resource region, 185 /* Turn the ROM off, leave the resource region,
188 * but keep it unregistered. */ 186 * but keep it unregistered. */
189 u32 reg; 187 u32 reg;
190 DBG("PCI: Switching off ROM of %s\n", 188 dev_dbg(&dev->dev, "disabling ROM\n");
191 pci_name(dev));
192 r->flags &= ~IORESOURCE_ROM_ENABLE; 189 r->flags &= ~IORESOURCE_ROM_ENABLE;
193 pci_read_config_dword(dev, 190 pci_read_config_dword(dev,
194 dev->rom_base_reg, &reg); 191 dev->rom_base_reg, &reg);
@@ -257,8 +254,7 @@ void pcibios_set_master(struct pci_dev *dev)
257 lat = pcibios_max_latency; 254 lat = pcibios_max_latency;
258 else 255 else
259 return; 256 return;
260 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", 257 dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
261 pci_name(dev), lat);
262 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 258 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
263} 259}
264 260
@@ -280,6 +276,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma)
280static struct vm_operations_struct pci_mmap_ops = { 276static struct vm_operations_struct pci_mmap_ops = {
281 .open = pci_track_mmap_page_range, 277 .open = pci_track_mmap_page_range,
282 .close = pci_unmap_page_range, 278 .close = pci_unmap_page_range,
279 .access = generic_access_phys,
283}; 280};
284 281
285int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 282int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 6a06a2eb0597..fec0123b33a9 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -436,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
436{ 436{
437 WARN_ON_ONCE(pirq >= 9); 437 WARN_ON_ONCE(pirq >= 9);
438 if (pirq > 8) { 438 if (pirq > 8) {
439 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 439 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
440 return 0; 440 return 0;
441 } 441 }
442 return read_config_nybble(router, 0x74, pirq-1); 442 return read_config_nybble(router, 0x74, pirq-1);
@@ -446,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
446{ 446{
447 WARN_ON_ONCE(pirq >= 9); 447 WARN_ON_ONCE(pirq >= 9);
448 if (pirq > 8) { 448 if (pirq > 8) {
449 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 449 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
450 return 0; 450 return 0;
451 } 451 }
452 write_config_nybble(router, 0x74, pirq-1, irq); 452 write_config_nybble(router, 0x74, pirq-1, irq);
@@ -492,15 +492,17 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
492 irq = 0; 492 irq = 0;
493 if (pirq <= 4) 493 if (pirq <= 4)
494 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
495 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", 495 dev_info(&dev->dev,
496 dev->vendor, dev->device, pirq, irq); 496 "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
497 dev->vendor, dev->device, pirq, irq);
497 return irq; 498 return irq;
498} 499}
499 500
500static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
501{ 502{
502 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 503 dev_info(&dev->dev,
503 dev->vendor, dev->device, pirq, irq); 504 "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
505 dev->vendor, dev->device, pirq, irq);
504 if (pirq <= 4) 506 if (pirq <= 4)
505 write_config_nybble(router, 0x56, pirq - 1, irq); 507 write_config_nybble(router, 0x56, pirq - 1, irq);
506 return 1; 508 return 1;
@@ -730,7 +732,6 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router,
730 switch (device) { 732 switch (device) {
731 case PCI_DEVICE_ID_AL_M1533: 733 case PCI_DEVICE_ID_AL_M1533:
732 case PCI_DEVICE_ID_AL_M1563: 734 case PCI_DEVICE_ID_AL_M1563:
733 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
734 r->name = "ALI"; 735 r->name = "ALI";
735 r->get = pirq_ali_get; 736 r->get = pirq_ali_get;
736 r->set = pirq_ali_set; 737 r->set = pirq_ali_set;
@@ -840,11 +841,9 @@ static void __init pirq_find_router(struct irq_router *r)
840 h->probe(r, pirq_router_dev, pirq_router_dev->device)) 841 h->probe(r, pirq_router_dev, pirq_router_dev->device))
841 break; 842 break;
842 } 843 }
843 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", 844 dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
844 pirq_router.name, 845 pirq_router.name,
845 pirq_router_dev->vendor, 846 pirq_router_dev->vendor, pirq_router_dev->device);
846 pirq_router_dev->device,
847 pci_name(pirq_router_dev));
848 847
849 /* The device remains referenced for the kernel lifetime */ 848 /* The device remains referenced for the kernel lifetime */
850} 849}
@@ -877,7 +876,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
877 /* Find IRQ pin */ 876 /* Find IRQ pin */
878 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 877 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
879 if (!pin) { 878 if (!pin) {
880 DBG(KERN_DEBUG " -> no interrupt pin\n"); 879 dev_dbg(&dev->dev, "no interrupt pin\n");
881 return 0; 880 return 0;
882 } 881 }
883 pin = pin - 1; 882 pin = pin - 1;
@@ -887,20 +886,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
887 if (!pirq_table) 886 if (!pirq_table)
888 return 0; 887 return 0;
889 888
890 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
891 info = pirq_get_info(dev); 889 info = pirq_get_info(dev);
892 if (!info) { 890 if (!info) {
893 DBG(" -> not found in routing table\n" KERN_DEBUG); 891 dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
892 'A' + pin);
894 return 0; 893 return 0;
895 } 894 }
896 pirq = info->irq[pin].link; 895 pirq = info->irq[pin].link;
897 mask = info->irq[pin].bitmap; 896 mask = info->irq[pin].bitmap;
898 if (!pirq) { 897 if (!pirq) {
899 DBG(" -> not routed\n" KERN_DEBUG); 898 dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
900 return 0; 899 return 0;
901 } 900 }
902 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, 901 dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
903 pirq_table->exclusive_irqs); 902 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
904 mask &= pcibios_irq_mask; 903 mask &= pcibios_irq_mask;
905 904
906 /* Work around broken HP Pavilion Notebooks which assign USB to 905 /* Work around broken HP Pavilion Notebooks which assign USB to
@@ -930,10 +929,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
930 if (pci_probe & PCI_USE_PIRQ_MASK) 929 if (pci_probe & PCI_USE_PIRQ_MASK)
931 newirq = 0; 930 newirq = 0;
932 else 931 else
933 printk("\n" KERN_WARNING 932 dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
934 "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" 933 "%#x; try pci=usepirqmask\n", newirq, mask);
935 KERN_DEBUG, newirq,
936 pci_name(dev));
937 } 934 }
938 if (!newirq && assign) { 935 if (!newirq && assign) {
939 for (i = 0; i < 16; i++) { 936 for (i = 0; i < 16; i++) {
@@ -944,39 +941,35 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
944 newirq = i; 941 newirq = i;
945 } 942 }
946 } 943 }
947 DBG(" -> newirq=%d", newirq); 944 dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
948 945
949 /* Check if it is hardcoded */ 946 /* Check if it is hardcoded */
950 if ((pirq & 0xf0) == 0xf0) { 947 if ((pirq & 0xf0) == 0xf0) {
951 irq = pirq & 0xf; 948 irq = pirq & 0xf;
952 DBG(" -> hardcoded IRQ %d\n", irq); 949 msg = "hardcoded";
953 msg = "Hardcoded";
954 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ 950 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
955 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { 951 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
956 DBG(" -> got IRQ %d\n", irq); 952 msg = "found";
957 msg = "Found";
958 eisa_set_level_irq(irq); 953 eisa_set_level_irq(irq);
959 } else if (newirq && r->set && 954 } else if (newirq && r->set &&
960 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { 955 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
961 DBG(" -> assigning IRQ %d", newirq);
962 if (r->set(pirq_router_dev, dev, pirq, newirq)) { 956 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
963 eisa_set_level_irq(newirq); 957 eisa_set_level_irq(newirq);
964 DBG(" ... OK\n"); 958 msg = "assigned";
965 msg = "Assigned";
966 irq = newirq; 959 irq = newirq;
967 } 960 }
968 } 961 }
969 962
970 if (!irq) { 963 if (!irq) {
971 DBG(" ... failed\n");
972 if (newirq && mask == (1 << newirq)) { 964 if (newirq && mask == (1 << newirq)) {
973 msg = "Guessed"; 965 msg = "guessed";
974 irq = newirq; 966 irq = newirq;
975 } else 967 } else {
968 dev_dbg(&dev->dev, "can't route interrupt\n");
976 return 0; 969 return 0;
970 }
977 } 971 }
978 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, 972 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
979 pci_name(dev));
980 973
981 /* Update IRQ for all devices with the same pirq value */ 974 /* Update IRQ for all devices with the same pirq value */
982 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 975 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -996,17 +989,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
996 (!(pci_probe & PCI_USE_PIRQ_MASK) || \ 989 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
997 ((1 << dev2->irq) & mask))) { 990 ((1 << dev2->irq) & mask))) {
998#ifndef CONFIG_PCI_MSI 991#ifndef CONFIG_PCI_MSI
999 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", 992 dev_info(&dev2->dev, "IRQ routing conflict: "
1000 pci_name(dev2), dev2->irq, irq); 993 "have IRQ %d, want IRQ %d\n",
994 dev2->irq, irq);
1001#endif 995#endif
1002 continue; 996 continue;
1003 } 997 }
1004 dev2->irq = irq; 998 dev2->irq = irq;
1005 pirq_penalty[irq]++; 999 pirq_penalty[irq]++;
1006 if (dev != dev2) 1000 if (dev != dev2)
1007 printk(KERN_INFO 1001 dev_info(&dev->dev, "sharing IRQ %d with %s\n",
1008 "PCI: Sharing IRQ %d with %s\n", 1002 irq, pci_name(dev2));
1009 irq, pci_name(dev2));
1010 } 1003 }
1011 } 1004 }
1012 return 1; 1005 return 1;
@@ -1025,8 +1018,7 @@ static void __init pcibios_fixup_irqs(void)
1025 * already in use. 1018 * already in use.
1026 */ 1019 */
1027 if (dev->irq >= 16) { 1020 if (dev->irq >= 16) {
1028 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", 1021 dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
1029 pci_name(dev), dev->irq);
1030 dev->irq = 0; 1022 dev->irq = 0;
1031 } 1023 }
1032 /* 1024 /*
@@ -1070,12 +1062,12 @@ static void __init pcibios_fixup_irqs(void)
1070 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1062 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1071 PCI_SLOT(bridge->devfn), pin); 1063 PCI_SLOT(bridge->devfn), pin);
1072 if (irq >= 0) 1064 if (irq >= 0)
1073 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1065 dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
1074 pci_name(bridge), 'A' + pin, irq); 1066 pci_name(bridge),
1067 'A' + pin, irq);
1075 } 1068 }
1076 if (irq >= 0) { 1069 if (irq >= 0) {
1077 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1070 dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
1078 pci_name(dev), 'A' + pin, irq);
1079 dev->irq = irq; 1071 dev->irq = irq;
1080 } 1072 }
1081 } 1073 }
@@ -1231,25 +1223,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
1231 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1223 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1232 PCI_SLOT(bridge->devfn), pin); 1224 PCI_SLOT(bridge->devfn), pin);
1233 if (irq >= 0) 1225 if (irq >= 0)
1234 printk(KERN_WARNING 1226 dev_warn(&dev->dev, "using bridge %s "
1235 "PCI: using PPB %s[%c] to get irq %d\n", 1227 "INT %c to get IRQ %d\n",
1236 pci_name(bridge), 1228 pci_name(bridge), 'A' + pin,
1237 'A' + pin, irq); 1229 irq);
1238 dev = bridge; 1230 dev = bridge;
1239 } 1231 }
1240 dev = temp_dev; 1232 dev = temp_dev;
1241 if (irq >= 0) { 1233 if (irq >= 0) {
1242 printk(KERN_INFO 1234 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1243 "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1235 "INT %c -> IRQ %d\n", 'A' + pin, irq);
1244 pci_name(dev), 'A' + pin, irq);
1245 dev->irq = irq; 1236 dev->irq = irq;
1246 return 0; 1237 return 0;
1247 } else 1238 } else
1248 msg = " Probably buggy MP table."; 1239 msg = "; probably buggy MP table";
1249 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1240 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1250 msg = ""; 1241 msg = "";
1251 else 1242 else
1252 msg = " Please try using pci=biosirq."; 1243 msg = "; please try using pci=biosirq";
1253 1244
1254 /* 1245 /*
1255 * With IDE legacy devices the IRQ lookup failure is not 1246 * With IDE legacy devices the IRQ lookup failure is not
@@ -1259,9 +1250,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
1259 !(dev->class & 0x5)) 1250 !(dev->class & 0x5))
1260 return 0; 1251 return 0;
1261 1252
1262 printk(KERN_WARNING 1253 dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
1263 "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", 1254 'A' + pin, msg);
1264 'A' + pin, pci_name(dev), msg);
1265 } 1255 }
1266 return 0; 1256 return 0;
1267} 1257}
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index f4b16dc11dad..1177845d3186 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -131,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
131 u8 busno, suba, subb; 131 u8 busno, suba, subb;
132 int quad = BUS2QUAD(d->bus->number); 132 int quad = BUS2QUAD(d->bus->number);
133 133
134 printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); 134 dev_info(&d->dev, "searching for i450NX host bridges\n");
135 reg = 0xd0; 135 reg = 0xd0;
136 for(pxb=0; pxb<2; pxb++) { 136 for(pxb=0; pxb<2; pxb++) {
137 pci_read_config_byte(d, reg++, &busno); 137 pci_read_config_byte(d, reg++, &busno);
138 pci_read_config_byte(d, reg++, &suba); 138 pci_read_config_byte(d, reg++, &suba);
139 pci_read_config_byte(d, reg++, &subb); 139 pci_read_config_byte(d, reg++, &subb);
140 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 140 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n",
141 pxb, busno, suba, subb);
141 if (busno) { 142 if (busno) {
142 /* Bus A */ 143 /* Bus A */
143 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); 144 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 194bbd6e3241..9ff6e3cbf08f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1435,7 +1435,7 @@ static unsigned long m2p(phys_addr_t maddr)
1435{ 1435{
1436 phys_addr_t paddr; 1436 phys_addr_t paddr;
1437 1437
1438 maddr &= PTE_MASK; 1438 maddr &= PTE_PFN_MASK;
1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; 1439 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1440 1440
1441 return paddr; 1441 return paddr;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a44d56e38bd1..aa37469da696 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -343,8 +343,8 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
343static pteval_t pte_mfn_to_pfn(pteval_t val) 343static pteval_t pte_mfn_to_pfn(pteval_t val)
344{ 344{
345 if (val & _PAGE_PRESENT) { 345 if (val & _PAGE_PRESENT) {
346 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 346 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
347 pteval_t flags = val & ~PTE_MASK; 347 pteval_t flags = val & PTE_FLAGS_MASK;
348 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 348 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
349 } 349 }
350 350
@@ -354,8 +354,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
354static pteval_t pte_pfn_to_mfn(pteval_t val) 354static pteval_t pte_pfn_to_mfn(pteval_t val)
355{ 355{
356 if (val & _PAGE_PRESENT) { 356 if (val & _PAGE_PRESENT) {
357 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 357 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
358 pteval_t flags = val & ~PTE_MASK; 358 pteval_t flags = val & PTE_FLAGS_MASK;
359 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 359 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
360 } 360 }
361 361
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index e693812ac59a..d8faf79a0a1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -367,7 +367,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
367 367
368 cpus_and(mask, mask, cpu_online_map); 368 cpus_and(mask, mask, cpu_online_map);
369 369
370 for_each_cpu_mask(cpu, mask) 370 for_each_cpu_mask_nr(cpu, mask)
371 xen_send_IPI_one(cpu, vector); 371 xen_send_IPI_one(cpu, vector);
372} 372}
373 373
@@ -378,7 +378,7 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask)
378 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 378 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
379 379
380 /* Make sure other vcpus get a chance to run if they need to. */ 380 /* Make sure other vcpus get a chance to run if they need to. */
381 for_each_cpu_mask(cpu, mask) { 381 for_each_cpu_mask_nr(cpu, mask) {
382 if (xen_vcpu_stolen(cpu)) { 382 if (xen_vcpu_stolen(cpu)) {
383 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 383 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
384 break; 384 break;
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 4038cbfe3331..7f58304fafb3 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -173,7 +173,7 @@ ENTRY(xen_sysexit)
173 pushq $__USER32_CS 173 pushq $__USER32_CS
174 pushq %rdx 174 pushq %rdx
175 175
176 pushq $VGCF_in_syscall 176 pushq $0
1771: jmp hypercall_iret 1771: jmp hypercall_iret
178ENDPATCH(xen_sysexit) 178ENDPATCH(xen_sysexit)
179RELOC(xen_sysexit, 1b+1) 179RELOC(xen_sysexit, 1b+1)