Diffstat (limited to 'arch/x86')
162 files changed, 3181 insertions, 2139 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52421d52f21e..f46f30d23eb0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -24,18 +24,21 @@ config X86 | |||
24 | select HAVE_UNSTABLE_SCHED_CLOCK | 24 | select HAVE_UNSTABLE_SCHED_CLOCK |
25 | select HAVE_IDE | 25 | select HAVE_IDE |
26 | select HAVE_OPROFILE | 26 | select HAVE_OPROFILE |
27 | select HAVE_PERF_COUNTERS if (!M386 && !M486) | ||
27 | select HAVE_IOREMAP_PROT | 28 | select HAVE_IOREMAP_PROT |
28 | select HAVE_KPROBES | 29 | select HAVE_KPROBES |
29 | select ARCH_WANT_OPTIONAL_GPIOLIB | 30 | select ARCH_WANT_OPTIONAL_GPIOLIB |
30 | select ARCH_WANT_FRAME_POINTERS | 31 | select ARCH_WANT_FRAME_POINTERS |
32 | select HAVE_DMA_ATTRS | ||
31 | select HAVE_KRETPROBES | 33 | select HAVE_KRETPROBES |
32 | select HAVE_FTRACE_MCOUNT_RECORD | 34 | select HAVE_FTRACE_MCOUNT_RECORD |
33 | select HAVE_DYNAMIC_FTRACE | 35 | select HAVE_DYNAMIC_FTRACE |
34 | select HAVE_FUNCTION_TRACER | 36 | select HAVE_FUNCTION_TRACER |
35 | select HAVE_FUNCTION_GRAPH_TRACER | 37 | select HAVE_FUNCTION_GRAPH_TRACER |
38 | select HAVE_FUNCTION_GRAPH_FP_TEST | ||
36 | select HAVE_FUNCTION_TRACE_MCOUNT_TEST | 39 | select HAVE_FUNCTION_TRACE_MCOUNT_TEST |
37 | select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE | 40 | select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE |
38 | select HAVE_FTRACE_SYSCALLS | 41 | select HAVE_SYSCALL_TRACEPOINTS |
39 | select HAVE_KVM | 42 | select HAVE_KVM |
40 | select HAVE_ARCH_KGDB | 43 | select HAVE_ARCH_KGDB |
41 | select HAVE_ARCH_TRACEHOOK | 44 | select HAVE_ARCH_TRACEHOOK |
@@ -741,7 +744,6 @@ config X86_UP_IOAPIC | |||
741 | config X86_LOCAL_APIC | 744 | config X86_LOCAL_APIC |
742 | def_bool y | 745 | def_bool y |
743 | depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC | 746 | depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC |
744 | select HAVE_PERF_COUNTERS if (!M386 && !M486) | ||
745 | 747 | ||
746 | config X86_IO_APIC | 748 | config X86_IO_APIC |
747 | def_bool y | 749 | def_bool y |
@@ -1912,25 +1914,26 @@ config DMAR_DEFAULT_ON | |||
1912 | recommended you say N here while the DMAR code remains | 1914 | recommended you say N here while the DMAR code remains |
1913 | experimental. | 1915 | experimental. |
1914 | 1916 | ||
1915 | config DMAR_GFX_WA | 1917 | config DMAR_BROKEN_GFX_WA |
1916 | def_bool y | 1918 | def_bool n |
1917 | prompt "Support for Graphics workaround" | 1919 | prompt "Workaround broken graphics drivers (going away soon)" |
1918 | depends on DMAR | 1920 | depends on DMAR |
1919 | ---help--- | 1921 | ---help--- |
1920 | Current Graphics drivers tend to use physical address | 1922 | Current Graphics drivers tend to use physical address |
1921 | for DMA and avoid using DMA APIs. Setting this config | 1923 | for DMA and avoid using DMA APIs. Setting this config |
1922 | option permits the IOMMU driver to set a unity map for | 1924 | option permits the IOMMU driver to set a unity map for |
1923 | all the OS-visible memory. Hence the driver can continue | 1925 | all the OS-visible memory. Hence the driver can continue |
1924 | to use physical addresses for DMA. | 1926 | to use physical addresses for DMA, at least until this |
1927 | option is removed in the 2.6.32 kernel. | ||
1925 | 1928 | ||
1926 | config DMAR_FLOPPY_WA | 1929 | config DMAR_FLOPPY_WA |
1927 | def_bool y | 1930 | def_bool y |
1928 | depends on DMAR | 1931 | depends on DMAR |
1929 | ---help--- | 1932 | ---help--- |
1930 | Floppy disk drivers are know to bypass DMA API calls | 1933 | Floppy disk drivers are known to bypass DMA API calls |
1931 | thereby failing to work when IOMMU is enabled. This | 1934 | thereby failing to work when IOMMU is enabled. This |
1932 | workaround will setup a 1:1 mapping for the first | 1935 | workaround will setup a 1:1 mapping for the first |
1933 | 16M to make floppy (an ISA device) work. | 1936 | 16MiB to make floppy (an ISA device) work. |
1934 | 1937 | ||
1935 | config INTR_REMAP | 1938 | config INTR_REMAP |
1936 | bool "Support for Interrupt Remapping (EXPERIMENTAL)" | 1939 | bool "Support for Interrupt Remapping (EXPERIMENTAL)" |
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 8d16ada25048..ec749c2bfdd7 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile | |||
@@ -70,6 +70,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ | |||
70 | $(call cc-option, -mpreferred-stack-boundary=2) | 70 | $(call cc-option, -mpreferred-stack-boundary=2) |
71 | KBUILD_CFLAGS += $(call cc-option, -m32) | 71 | KBUILD_CFLAGS += $(call cc-option, -m32) |
72 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ | 72 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ |
73 | GCOV_PROFILE := n | ||
73 | 74 | ||
74 | $(obj)/bzImage: asflags-y := $(SVGA_MODE) | 75 | $(obj)/bzImage: asflags-y := $(SVGA_MODE) |
75 | 76 | ||
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 507793739ea5..1dfbf64e52a2 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S | |||
@@ -13,7 +13,7 @@ | |||
13 | * touching registers they shouldn't be. | 13 | * touching registers they shouldn't be. |
14 | */ | 14 | */ |
15 | 15 | ||
16 | .code16 | 16 | .code16gcc |
17 | .text | 17 | .text |
18 | .globl intcall | 18 | .globl intcall |
19 | .type intcall, @function | 19 | .type intcall, @function |
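The bioscall.S change swaps .code16 for .code16gcc: this file is linked into setup code that gcc compiles with -m32, so gas must assemble it for a 16-bit segment while preserving gcc's 32-bit operand and stack conventions (emitting operand/address-size prefixes, using retl, and so on). Otherwise the register image pushed by C callers does not line up. The C-side contract, as declared in the boot code's boot.h:

	struct biosregs;	/* snapshot of the CPU registers around the INT */
	void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
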
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 49c8a4c37d7c..f8ed0658404c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile | |||
@@ -4,7 +4,7 @@ | |||
4 | # create a compressed vmlinux image from the original vmlinux | 4 | # create a compressed vmlinux image from the original vmlinux |
5 | # | 5 | # |
6 | 6 | ||
7 | targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o | 7 | targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o |
8 | 8 | ||
9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 | 9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 |
10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC | 10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC |
@@ -15,6 +15,7 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding) | |||
15 | KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) | 15 | KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) |
16 | 16 | ||
17 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ | 17 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ |
18 | GCOV_PROFILE := n | ||
18 | 19 | ||
19 | LDFLAGS := -m elf_$(UTS_MACHINE) | 20 | LDFLAGS := -m elf_$(UTS_MACHINE) |
20 | LDFLAGS_vmlinux := -T | 21 | LDFLAGS_vmlinux := -T |
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index d660be492363..49e0c18833e0 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c | |||
@@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode) | |||
37 | ireg.al = mode; /* AH=0x00 Set Video Mode */ | 37 | ireg.al = mode; /* AH=0x00 Set Video Mode */ |
38 | intcall(0x10, &ireg, NULL); | 38 | intcall(0x10, &ireg, NULL); |
39 | 39 | ||
40 | |||
41 | ireg.ah = 0x0f; /* Get Current Video Mode */ | 40 | ireg.ah = 0x0f; /* Get Current Video Mode */ |
42 | intcall(0x10, &ireg, &oreg); | 41 | intcall(0x10, &ireg, &oreg); |
43 | 42 | ||
44 | do_restore = 1; /* Assume video contents were lost */ | 43 | do_restore = 1; /* Assume video contents were lost */ |
45 | 44 | ||
46 | /* Not all BIOSes are clean with the top bit */ | 45 | /* Not all BIOSes are clean with the top bit */ |
47 | new_mode = ireg.al & 0x7f; | 46 | new_mode = oreg.al & 0x7f; |
48 | 47 | ||
49 | if (new_mode == mode) | 48 | if (new_mode == mode) |
50 | return 0; /* Mode change OK */ | 49 | return 0; /* Mode change OK */ |
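The video-bios.c hunk fixes a real bug: intcall() takes the input register image through its second argument and returns the post-interrupt state through the third, so the mode reported by the BIOS must be read from oreg.al. Reading ireg.al compared the requested mode against itself, so the check always "passed" and a failed mode set went unnoticed. The corrected pattern, condensed from set_bios_mode() (the video-vesa.c hunks below apply the same oreg fix to the AX status checks):

	struct biosregs ireg, oreg;
	u8 new_mode;

	initregs(&ireg);
	ireg.al = mode;			/* AH=0x00: set video mode */
	intcall(0x10, &ireg, NULL);

	ireg.ah = 0x0f;			/* AH=0x0F: get current video mode */
	intcall(0x10, &ireg, &oreg);

	new_mode = oreg.al & 0x7f;	/* read the *output* AL; bit 7 is unreliable */
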
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index c700147d6ffb..275dd177f198 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c | |||
@@ -45,7 +45,7 @@ static int vesa_probe(void) | |||
45 | ireg.di = (size_t)&vginfo; | 45 | ireg.di = (size_t)&vginfo; |
46 | intcall(0x10, &ireg, &oreg); | 46 | intcall(0x10, &ireg, &oreg); |
47 | 47 | ||
48 | if (ireg.ax != 0x004f || | 48 | if (oreg.ax != 0x004f || |
49 | vginfo.signature != VESA_MAGIC || | 49 | vginfo.signature != VESA_MAGIC || |
50 | vginfo.version < 0x0102) | 50 | vginfo.version < 0x0102) |
51 | return 0; /* Not present */ | 51 | return 0; /* Not present */ |
@@ -70,7 +70,7 @@ static int vesa_probe(void) | |||
70 | ireg.di = (size_t)&vminfo; | 70 | ireg.di = (size_t)&vminfo; |
71 | intcall(0x10, &ireg, &oreg); | 71 | intcall(0x10, &ireg, &oreg); |
72 | 72 | ||
73 | if (ireg.ax != 0x004f) | 73 | if (oreg.ax != 0x004f) |
74 | continue; | 74 | continue; |
75 | 75 | ||
76 | if ((vminfo.mode_attr & 0x15) == 0x05) { | 76 | if ((vminfo.mode_attr & 0x15) == 0x05) { |
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index edb992ebef92..d28fad19654a 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig | |||
@@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y | |||
2355 | CONFIG_HAVE_DYNAMIC_FTRACE=y | 2355 | CONFIG_HAVE_DYNAMIC_FTRACE=y |
2356 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y | 2356 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y |
2357 | CONFIG_HAVE_HW_BRANCH_TRACER=y | 2357 | CONFIG_HAVE_HW_BRANCH_TRACER=y |
2358 | CONFIG_HAVE_FTRACE_SYSCALLS=y | 2358 | CONFIG_HAVE_SYSCALL_TRACEPOINTS=y |
2359 | CONFIG_RING_BUFFER=y | 2359 | CONFIG_RING_BUFFER=y |
2360 | CONFIG_TRACING=y | 2360 | CONFIG_TRACING=y |
2361 | CONFIG_TRACING_SUPPORT=y | 2361 | CONFIG_TRACING_SUPPORT=y |
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cee1dd2e69b2..6c86acd847a4 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig | |||
@@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y | |||
2329 | CONFIG_HAVE_DYNAMIC_FTRACE=y | 2329 | CONFIG_HAVE_DYNAMIC_FTRACE=y |
2330 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y | 2330 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y |
2331 | CONFIG_HAVE_HW_BRANCH_TRACER=y | 2331 | CONFIG_HAVE_HW_BRANCH_TRACER=y |
2332 | CONFIG_HAVE_FTRACE_SYSCALLS=y | 2332 | CONFIG_HAVE_SYSCALL_TRACEPOINTS=y |
2333 | CONFIG_RING_BUFFER=y | 2333 | CONFIG_RING_BUFFER=y |
2334 | CONFIG_TRACING=y | 2334 | CONFIG_TRACING=y |
2335 | CONFIG_TRACING_SUPPORT=y | 2335 | CONFIG_TRACING_SUPPORT=y |
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index caba99601703..eb0566e83319 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S | |||
@@ -845,7 +845,7 @@ ENTRY(aesni_cbc_enc) | |||
845 | */ | 845 | */ |
846 | ENTRY(aesni_cbc_dec) | 846 | ENTRY(aesni_cbc_dec) |
847 | cmp $16, LEN | 847 | cmp $16, LEN |
848 | jb .Lcbc_dec_ret | 848 | jb .Lcbc_dec_just_ret |
849 | mov 480(KEYP), KLEN | 849 | mov 480(KEYP), KLEN |
850 | add $240, KEYP | 850 | add $240, KEYP |
851 | movups (IVP), IV | 851 | movups (IVP), IV |
@@ -891,6 +891,7 @@ ENTRY(aesni_cbc_dec) | |||
891 | add $16, OUTP | 891 | add $16, OUTP |
892 | cmp $16, LEN | 892 | cmp $16, LEN |
893 | jge .Lcbc_dec_loop1 | 893 | jge .Lcbc_dec_loop1 |
894 | movups IV, (IVP) | ||
895 | .Lcbc_dec_ret: | 894 | .Lcbc_dec_ret: |
895 | movups IV, (IVP) | ||
896 | .Lcbc_dec_just_ret: | ||
896 | ret | 897 | ret |
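In CBC mode the decryptor must hand back the last ciphertext block as the IV for the next invocation. The rearranged assembly stores IV to (IVP) on the normal exit path, while inputs shorter than one block now branch to the new .Lcbc_dec_just_ret label, skipping the store so the caller's IV is not clobbered when nothing was decrypted. In C terms, the invariant the loop maintains looks roughly like this (aes_decrypt_block() and xor_block() are hypothetical one-block helpers):

	for (i = 0; i + 16 <= len; i += 16) {
		u8 tmp[16];

		memcpy(tmp, in + i, 16);		 /* save ciphertext block */
		aes_decrypt_block(key, out + i, in + i); /* raw block decryption */
		xor_block(out + i, iv);			 /* P_i = D(C_i) ^ C_{i-1} */
		memcpy(iv, tmp, 16);			 /* becomes IV for next call */
	}
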
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 4e663398f77f..c580c5ec1cad 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c | |||
@@ -198,6 +198,7 @@ static int ecb_encrypt(struct blkcipher_desc *desc, | |||
198 | 198 | ||
199 | blkcipher_walk_init(&walk, dst, src, nbytes); | 199 | blkcipher_walk_init(&walk, dst, src, nbytes); |
200 | err = blkcipher_walk_virt(desc, &walk); | 200 | err = blkcipher_walk_virt(desc, &walk); |
201 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
201 | 202 | ||
202 | kernel_fpu_begin(); | 203 | kernel_fpu_begin(); |
203 | while ((nbytes = walk.nbytes)) { | 204 | while ((nbytes = walk.nbytes)) { |
@@ -221,6 +222,7 @@ static int ecb_decrypt(struct blkcipher_desc *desc, | |||
221 | 222 | ||
222 | blkcipher_walk_init(&walk, dst, src, nbytes); | 223 | blkcipher_walk_init(&walk, dst, src, nbytes); |
223 | err = blkcipher_walk_virt(desc, &walk); | 224 | err = blkcipher_walk_virt(desc, &walk); |
225 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
224 | 226 | ||
225 | kernel_fpu_begin(); | 227 | kernel_fpu_begin(); |
226 | while ((nbytes = walk.nbytes)) { | 228 | while ((nbytes = walk.nbytes)) { |
@@ -266,6 +268,7 @@ static int cbc_encrypt(struct blkcipher_desc *desc, | |||
266 | 268 | ||
267 | blkcipher_walk_init(&walk, dst, src, nbytes); | 269 | blkcipher_walk_init(&walk, dst, src, nbytes); |
268 | err = blkcipher_walk_virt(desc, &walk); | 270 | err = blkcipher_walk_virt(desc, &walk); |
271 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
269 | 272 | ||
270 | kernel_fpu_begin(); | 273 | kernel_fpu_begin(); |
271 | while ((nbytes = walk.nbytes)) { | 274 | while ((nbytes = walk.nbytes)) { |
@@ -289,6 +292,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc, | |||
289 | 292 | ||
290 | blkcipher_walk_init(&walk, dst, src, nbytes); | 293 | blkcipher_walk_init(&walk, dst, src, nbytes); |
291 | err = blkcipher_walk_virt(desc, &walk); | 294 | err = blkcipher_walk_virt(desc, &walk); |
295 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
292 | 296 | ||
293 | kernel_fpu_begin(); | 297 | kernel_fpu_begin(); |
294 | while ((nbytes = walk.nbytes)) { | 298 | while ((nbytes = walk.nbytes)) { |
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index 5f9781a3815f..daef6cd2b45d 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c | |||
@@ -48,7 +48,7 @@ static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, | |||
48 | struct blkcipher_desc desc = { | 48 | struct blkcipher_desc desc = { |
49 | .tfm = child, | 49 | .tfm = child, |
50 | .info = desc_in->info, | 50 | .info = desc_in->info, |
51 | .flags = desc_in->flags, | 51 | .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, |
52 | }; | 52 | }; |
53 | 53 | ||
54 | kernel_fpu_begin(); | 54 | kernel_fpu_begin(); |
@@ -67,7 +67,7 @@ static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, | |||
67 | struct blkcipher_desc desc = { | 67 | struct blkcipher_desc desc = { |
68 | .tfm = child, | 68 | .tfm = child, |
69 | .info = desc_in->info, | 69 | .info = desc_in->info, |
70 | .flags = desc_in->flags, | 70 | .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, |
71 | }; | 71 | }; |
72 | 72 | ||
73 | kernel_fpu_begin(); | 73 | kernel_fpu_begin(); |
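Both the AES-NI glue and the crypto FPU template now strip CRYPTO_TFM_REQ_MAY_SLEEP before doing any work: kernel_fpu_begin() disables preemption, and if the flag were left set, blkcipher_walk_done() could legitimately sleep while mapping the next chunk, which is forbidden in that region. The resulting pattern, condensed from the glue code above:

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt(desc, &walk);
	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;	/* no sleeping from here on */

	kernel_fpu_begin();				/* preemption off */
	while ((nbytes = walk.nbytes)) {
		/* process walk.src.virt.addr into walk.dst.virt.addr ... */
		nbytes &= AES_BLOCK_SIZE - 1;		/* leftover, if any */
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}
	kernel_fpu_end();
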
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 4518dc500903..20d1465a2ab0 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h | |||
@@ -144,6 +144,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) | |||
144 | 144 | ||
145 | #else /* !CONFIG_ACPI */ | 145 | #else /* !CONFIG_ACPI */ |
146 | 146 | ||
147 | #define acpi_disabled 1 | ||
147 | #define acpi_lapic 0 | 148 | #define acpi_lapic 0 |
148 | #define acpi_ioapic 0 | 149 | #define acpi_ioapic 0 |
149 | static inline void acpi_noirq_set(void) { } | 150 | static inline void acpi_noirq_set(void) { } |
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 262e02820049..bdf96f119f06 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h | |||
@@ -29,9 +29,11 @@ extern void amd_iommu_detect(void); | |||
29 | extern irqreturn_t amd_iommu_int_handler(int irq, void *data); | 29 | extern irqreturn_t amd_iommu_int_handler(int irq, void *data); |
30 | extern void amd_iommu_flush_all_domains(void); | 30 | extern void amd_iommu_flush_all_domains(void); |
31 | extern void amd_iommu_flush_all_devices(void); | 31 | extern void amd_iommu_flush_all_devices(void); |
32 | extern void amd_iommu_shutdown(void); | ||
32 | #else | 33 | #else |
33 | static inline int amd_iommu_init(void) { return -ENODEV; } | 34 | static inline int amd_iommu_init(void) { return -ENODEV; } |
34 | static inline void amd_iommu_detect(void) { } | 35 | static inline void amd_iommu_detect(void) { } |
36 | static inline void amd_iommu_shutdown(void) { } | ||
35 | #endif | 37 | #endif |
36 | 38 | ||
37 | #endif /* _ASM_X86_AMD_IOMMU_H */ | 39 | #endif /* _ASM_X86_AMD_IOMMU_H */ |
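The new amd_iommu_shutdown() follows the kernel's usual stub idiom: when CONFIG_AMD_IOMMU is off, the header supplies an empty static inline with the same signature, so call sites stay free of #ifdefs and the compiler discards the call entirely. A call-site sketch (the wrapper name is hypothetical):

	void machine_shutdown_iommu(void)
	{
		/* real function, or the empty inline stub; either way this
		 * compiles unconditionally */
		amd_iommu_shutdown();
	}
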
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 8cb9c814e120..dc5a667ff791 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h | |||
@@ -19,7 +19,10 @@ | |||
19 | * | 19 | * |
20 | * Atomically reads the value of @v. | 20 | * Atomically reads the value of @v. |
21 | */ | 21 | */ |
22 | #define atomic_read(v) ((v)->counter) | 22 | static inline int atomic_read(const atomic_t *v) |
23 | { | ||
24 | return v->counter; | ||
25 | } | ||
23 | 26 | ||
24 | /** | 27 | /** |
25 | * atomic_set - set atomic variable | 28 | * atomic_set - set atomic variable |
@@ -28,7 +31,10 @@ | |||
28 | * | 31 | * |
29 | * Atomically sets the value of @v to @i. | 32 | * Atomically sets the value of @v to @i. |
30 | */ | 33 | */ |
31 | #define atomic_set(v, i) (((v)->counter) = (i)) | 34 | static inline void atomic_set(atomic_t *v, int i) |
35 | { | ||
36 | v->counter = i; | ||
37 | } | ||
32 | 38 | ||
33 | /** | 39 | /** |
34 | * atomic_add - add integer to atomic variable | 40 | * atomic_add - add integer to atomic variable |
@@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v) | |||
200 | return atomic_add_return(-i, v); | 206 | return atomic_add_return(-i, v); |
201 | } | 207 | } |
202 | 208 | ||
203 | #define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) | 209 | static inline int atomic_cmpxchg(atomic_t *v, int old, int new) |
204 | #define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) | 210 | { |
211 | return cmpxchg(&v->counter, old, new); | ||
212 | } | ||
213 | |||
214 | static inline int atomic_xchg(atomic_t *v, int new) | ||
215 | { | ||
216 | return xchg(&v->counter, new); | ||
217 | } | ||
205 | 218 | ||
206 | /** | 219 | /** |
207 | * atomic_add_unless - add unless the number is already a given value | 220 | * atomic_add_unless - add unless the number is already a given value |
@@ -250,67 +263,22 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) | |||
250 | /* An 64bit atomic type */ | 263 | /* An 64bit atomic type */ |
251 | 264 | ||
252 | typedef struct { | 265 | typedef struct { |
253 | unsigned long long counter; | 266 | u64 __aligned(8) counter; |
254 | } atomic64_t; | 267 | } atomic64_t; |
255 | 268 | ||
256 | #define ATOMIC64_INIT(val) { (val) } | 269 | #define ATOMIC64_INIT(val) { (val) } |
257 | 270 | ||
258 | /** | 271 | extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val); |
259 | * atomic64_read - read atomic64 variable | ||
260 | * @v: pointer of type atomic64_t | ||
261 | * | ||
262 | * Atomically reads the value of @v. | ||
263 | * Doesn't imply a read memory barrier. | ||
264 | */ | ||
265 | #define __atomic64_read(ptr) ((ptr)->counter) | ||
266 | |||
267 | static inline unsigned long long | ||
268 | cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) | ||
269 | { | ||
270 | asm volatile( | ||
271 | |||
272 | LOCK_PREFIX "cmpxchg8b (%[ptr])\n" | ||
273 | |||
274 | : "=A" (old) | ||
275 | |||
276 | : [ptr] "D" (ptr), | ||
277 | "A" (old), | ||
278 | "b" (ll_low(new)), | ||
279 | "c" (ll_high(new)) | ||
280 | |||
281 | : "memory"); | ||
282 | |||
283 | return old; | ||
284 | } | ||
285 | |||
286 | static inline unsigned long long | ||
287 | atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, | ||
288 | unsigned long long new_val) | ||
289 | { | ||
290 | return cmpxchg8b(&ptr->counter, old_val, new_val); | ||
291 | } | ||
292 | 272 | ||
293 | /** | 273 | /** |
294 | * atomic64_xchg - xchg atomic64 variable | 274 | * atomic64_xchg - xchg atomic64 variable |
295 | * @ptr: pointer to type atomic64_t | 275 | * @ptr: pointer to type atomic64_t |
296 | * @new_val: value to assign | 276 | * @new_val: value to assign |
297 | * @old_val: old value that was there | ||
298 | * | 277 | * |
299 | * Atomically xchgs the value of @ptr to @new_val and returns | 278 | * Atomically xchgs the value of @ptr to @new_val and returns |
300 | * the old value. | 279 | * the old value. |
301 | */ | 280 | */ |
302 | 281 | extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val); | |
303 | static inline unsigned long long | ||
304 | atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) | ||
305 | { | ||
306 | unsigned long long old_val; | ||
307 | |||
308 | do { | ||
309 | old_val = atomic_read(ptr); | ||
310 | } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); | ||
311 | |||
312 | return old_val; | ||
313 | } | ||
314 | 282 | ||
315 | /** | 283 | /** |
316 | * atomic64_set - set atomic64 variable | 284 | * atomic64_set - set atomic64 variable |
@@ -319,10 +287,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) | |||
319 | * | 287 | * |
320 | * Atomically sets the value of @ptr to @new_val. | 288 | * Atomically sets the value of @ptr to @new_val. |
321 | */ | 289 | */ |
322 | static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) | 290 | extern void atomic64_set(atomic64_t *ptr, u64 new_val); |
323 | { | ||
324 | atomic64_xchg(ptr, new_val); | ||
325 | } | ||
326 | 291 | ||
327 | /** | 292 | /** |
328 | * atomic64_read - read atomic64 variable | 293 | * atomic64_read - read atomic64 variable |
@@ -330,17 +295,30 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) | |||
330 | * | 295 | * |
331 | * Atomically reads the value of @ptr and returns it. | 296 | * Atomically reads the value of @ptr and returns it. |
332 | */ | 297 | */ |
333 | static inline unsigned long long atomic64_read(atomic64_t *ptr) | 298 | static inline u64 atomic64_read(atomic64_t *ptr) |
334 | { | 299 | { |
335 | unsigned long long curr_val; | 300 | u64 res; |
336 | 301 | ||
337 | do { | 302 | /* |
338 | curr_val = __atomic64_read(ptr); | 303 | * Note, we inline this atomic64_t primitive because |
339 | } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); | 304 | * it only clobbers EAX/EDX and leaves the others |
340 | 305 | * untouched. We also (somewhat subtly) rely on the | |
341 | return curr_val; | 306 | * fact that cmpxchg8b returns the current 64-bit value |
307 | * of the memory location we are touching: | ||
308 | */ | ||
309 | asm volatile( | ||
310 | "mov %%ebx, %%eax\n\t" | ||
311 | "mov %%ecx, %%edx\n\t" | ||
312 | LOCK_PREFIX "cmpxchg8b %1\n" | ||
313 | : "=&A" (res) | ||
314 | : "m" (*ptr) | ||
315 | ); | ||
316 | |||
317 | return res; | ||
342 | } | 318 | } |
343 | 319 | ||
320 | extern u64 atomic64_read(atomic64_t *ptr); | ||
321 | |||
344 | /** | 322 | /** |
345 | * atomic64_add_return - add and return | 323 | * atomic64_add_return - add and return |
346 | * @delta: integer value to add | 324 | * @delta: integer value to add |
@@ -348,34 +326,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr) | |||
348 | * | 326 | * |
349 | * Atomically adds @delta to @ptr and returns @delta + *@ptr | 327 | * Atomically adds @delta to @ptr and returns @delta + *@ptr |
350 | */ | 328 | */ |
351 | static inline unsigned long long | 329 | extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr); |
352 | atomic64_add_return(unsigned long long delta, atomic64_t *ptr) | ||
353 | { | ||
354 | unsigned long long old_val, new_val; | ||
355 | |||
356 | do { | ||
357 | old_val = atomic_read(ptr); | ||
358 | new_val = old_val + delta; | ||
359 | |||
360 | } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); | ||
361 | |||
362 | return new_val; | ||
363 | } | ||
364 | |||
365 | static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) | ||
366 | { | ||
367 | return atomic64_add_return(-delta, ptr); | ||
368 | } | ||
369 | 330 | ||
370 | static inline long atomic64_inc_return(atomic64_t *ptr) | 331 | /* |
371 | { | 332 | * Other variants with different arithmetic operators: |
372 | return atomic64_add_return(1, ptr); | 333 | */ |
373 | } | 334 | extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr); |
374 | 335 | extern u64 atomic64_inc_return(atomic64_t *ptr); | |
375 | static inline long atomic64_dec_return(atomic64_t *ptr) | 336 | extern u64 atomic64_dec_return(atomic64_t *ptr); |
376 | { | ||
377 | return atomic64_sub_return(1, ptr); | ||
378 | } | ||
379 | 337 | ||
380 | /** | 338 | /** |
381 | * atomic64_add - add integer to atomic64 variable | 339 | * atomic64_add - add integer to atomic64 variable |
@@ -384,10 +342,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr) | |||
384 | * | 342 | * |
385 | * Atomically adds @delta to @ptr. | 343 | * Atomically adds @delta to @ptr. |
386 | */ | 344 | */ |
387 | static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) | 345 | extern void atomic64_add(u64 delta, atomic64_t *ptr); |
388 | { | ||
389 | atomic64_add_return(delta, ptr); | ||
390 | } | ||
391 | 346 | ||
392 | /** | 347 | /** |
393 | * atomic64_sub - subtract the atomic64 variable | 348 | * atomic64_sub - subtract the atomic64 variable |
@@ -396,10 +351,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) | |||
396 | * | 351 | * |
397 | * Atomically subtracts @delta from @ptr. | 352 | * Atomically subtracts @delta from @ptr. |
398 | */ | 353 | */ |
399 | static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) | 354 | extern void atomic64_sub(u64 delta, atomic64_t *ptr); |
400 | { | ||
401 | atomic64_add(-delta, ptr); | ||
402 | } | ||
403 | 355 | ||
404 | /** | 356 | /** |
405 | * atomic64_sub_and_test - subtract value from variable and test result | 357 | * atomic64_sub_and_test - subtract value from variable and test result |
@@ -410,13 +362,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) | |||
410 | * true if the result is zero, or false for all | 362 | * true if the result is zero, or false for all |
411 | * other cases. | 363 | * other cases. |
412 | */ | 364 | */ |
413 | static inline int | 365 | extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr); |
414 | atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) | ||
415 | { | ||
416 | unsigned long long old_val = atomic64_sub_return(delta, ptr); | ||
417 | |||
418 | return old_val == 0; | ||
419 | } | ||
420 | 366 | ||
421 | /** | 367 | /** |
422 | * atomic64_inc - increment atomic64 variable | 368 | * atomic64_inc - increment atomic64 variable |
@@ -424,10 +370,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) | |||
424 | * | 370 | * |
425 | * Atomically increments @ptr by 1. | 371 | * Atomically increments @ptr by 1. |
426 | */ | 372 | */ |
427 | static inline void atomic64_inc(atomic64_t *ptr) | 373 | extern void atomic64_inc(atomic64_t *ptr); |
428 | { | ||
429 | atomic64_add(1, ptr); | ||
430 | } | ||
431 | 374 | ||
432 | /** | 375 | /** |
433 | * atomic64_dec - decrement atomic64 variable | 376 | * atomic64_dec - decrement atomic64 variable |
@@ -435,10 +378,7 @@ static inline void atomic64_inc(atomic64_t *ptr) | |||
435 | * | 378 | * |
436 | * Atomically decrements @ptr by 1. | 379 | * Atomically decrements @ptr by 1. |
437 | */ | 380 | */ |
438 | static inline void atomic64_dec(atomic64_t *ptr) | 381 | extern void atomic64_dec(atomic64_t *ptr); |
439 | { | ||
440 | atomic64_sub(1, ptr); | ||
441 | } | ||
442 | 382 | ||
443 | /** | 383 | /** |
444 | * atomic64_dec_and_test - decrement and test | 384 | * atomic64_dec_and_test - decrement and test |
@@ -448,10 +388,7 @@ static inline void atomic64_dec(atomic64_t *ptr) | |||
448 | * returns true if the result is 0, or false for all other | 388 | * returns true if the result is 0, or false for all other |
449 | * cases. | 389 | * cases. |
450 | */ | 390 | */ |
451 | static inline int atomic64_dec_and_test(atomic64_t *ptr) | 391 | extern int atomic64_dec_and_test(atomic64_t *ptr); |
452 | { | ||
453 | return atomic64_sub_and_test(1, ptr); | ||
454 | } | ||
455 | 392 | ||
456 | /** | 393 | /** |
457 | * atomic64_inc_and_test - increment and test | 394 | * atomic64_inc_and_test - increment and test |
@@ -461,10 +398,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr) | |||
461 | * and returns true if the result is zero, or false for all | 398 | * and returns true if the result is zero, or false for all |
462 | * other cases. | 399 | * other cases. |
463 | */ | 400 | */ |
464 | static inline int atomic64_inc_and_test(atomic64_t *ptr) | 401 | extern int atomic64_inc_and_test(atomic64_t *ptr); |
465 | { | ||
466 | return atomic64_sub_and_test(-1, ptr); | ||
467 | } | ||
468 | 402 | ||
469 | /** | 403 | /** |
470 | * atomic64_add_negative - add and test if negative | 404 | * atomic64_add_negative - add and test if negative |
@@ -475,13 +409,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr) | |||
475 | * if the result is negative, or false when | 409 | * if the result is negative, or false when |
476 | * result is greater than or equal to zero. | 410 | * result is greater than or equal to zero. |
477 | */ | 411 | */ |
478 | static inline int | 412 | extern int atomic64_add_negative(u64 delta, atomic64_t *ptr); |
479 | atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) | ||
480 | { | ||
481 | long long old_val = atomic64_add_return(delta, ptr); | ||
482 | |||
483 | return old_val < 0; | ||
484 | } | ||
485 | 413 | ||
486 | #include <asm-generic/atomic-long.h> | 414 | #include <asm-generic/atomic-long.h> |
487 | #endif /* _ASM_X86_ATOMIC_32_H */ | 415 | #endif /* _ASM_X86_ATOMIC_32_H */ |
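The atomic_32.h rework reflects that 32-bit x86 can only manipulate a 64-bit quantity atomically through lock cmpxchg8b, so every atomic64_t operation is a compare-and-exchange retry loop. Those loops move out of line (the extern declarations above); only atomic64_read() stays inline, since its cmpxchg8b trick clobbers nothing but EAX/EDX: loading EBX/ECX from EAX/EDX makes the instruction store back whatever it finds when the comparison happens to match, and either way EDX:EAX ends up holding the current value. The out-of-line operations keep the shape of the code deleted above, roughly:

	u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
	{
		u64 old_val, new_val;

		do {
			old_val = atomic64_read(ptr);
			new_val = old_val + delta;
		} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);

		return new_val;
	}
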
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 0d6360220007..d605dc268e79 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h | |||
@@ -18,7 +18,10 @@ | |||
18 | * | 18 | * |
19 | * Atomically reads the value of @v. | 19 | * Atomically reads the value of @v. |
20 | */ | 20 | */ |
21 | #define atomic_read(v) ((v)->counter) | 21 | static inline int atomic_read(const atomic_t *v) |
22 | { | ||
23 | return v->counter; | ||
24 | } | ||
22 | 25 | ||
23 | /** | 26 | /** |
24 | * atomic_set - set atomic variable | 27 | * atomic_set - set atomic variable |
@@ -27,7 +30,10 @@ | |||
27 | * | 30 | * |
28 | * Atomically sets the value of @v to @i. | 31 | * Atomically sets the value of @v to @i. |
29 | */ | 32 | */ |
30 | #define atomic_set(v, i) (((v)->counter) = (i)) | 33 | static inline void atomic_set(atomic_t *v, int i) |
34 | { | ||
35 | v->counter = i; | ||
36 | } | ||
31 | 37 | ||
32 | /** | 38 | /** |
33 | * atomic_add - add integer to atomic variable | 39 | * atomic_add - add integer to atomic variable |
@@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) | |||
192 | * Atomically reads the value of @v. | 198 | * Atomically reads the value of @v. |
193 | * Doesn't imply a read memory barrier. | 199 | * Doesn't imply a read memory barrier. |
194 | */ | 200 | */ |
195 | #define atomic64_read(v) ((v)->counter) | 201 | static inline long atomic64_read(const atomic64_t *v) |
202 | { | ||
203 | return v->counter; | ||
204 | } | ||
196 | 205 | ||
197 | /** | 206 | /** |
198 | * atomic64_set - set atomic64 variable | 207 | * atomic64_set - set atomic64 variable |
@@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v) | |||
201 | * | 210 | * |
202 | * Atomically sets the value of @v to @i. | 211 | * Atomically sets the value of @v to @i. |
203 | */ | 212 | */ |
204 | #define atomic64_set(v, i) (((v)->counter) = (i)) | 213 | static inline void atomic64_set(atomic64_t *v, long i) |
214 | { | ||
215 | v->counter = i; | ||
216 | } | ||
205 | 217 | ||
206 | /** | 218 | /** |
207 | * atomic64_add - add integer to atomic64 variable | 219 | * atomic64_add - add integer to atomic64 variable |
@@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v) | |||
355 | #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) | 367 | #define atomic64_inc_return(v) (atomic64_add_return(1, (v))) |
356 | #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) | 368 | #define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) |
357 | 369 | ||
358 | #define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) | 370 | static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) |
359 | #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) | 371 | { |
372 | return cmpxchg(&v->counter, old, new); | ||
373 | } | ||
374 | |||
375 | static inline long atomic64_xchg(atomic64_t *v, long new) | ||
376 | { | ||
377 | return xchg(&v->counter, new); | ||
378 | } | ||
360 | 379 | ||
361 | #define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) | 380 | static inline long atomic_cmpxchg(atomic_t *v, int old, int new) |
362 | #define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) | 381 | { |
382 | return cmpxchg(&v->counter, old, new); | ||
383 | } | ||
384 | |||
385 | static inline long atomic_xchg(atomic_t *v, int new) | ||
386 | { | ||
387 | return xchg(&v->counter, new); | ||
388 | } | ||
363 | 389 | ||
364 | /** | 390 | /** |
365 | * atomic_add_unless - add unless the number is a given value | 391 | * atomic_add_unless - add unless the number is a given value |
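In atomic_64.h the conversion from #define to static inline changes no generated code but buys type checking: the macros accepted anything with a ->counter member and let cmpxchg() infer the operand width, while the functions pin down the expected types. For example:

	atomic_t a = ATOMIC_INIT(0);
	atomic64_t b = ATOMIC64_INIT(0);

	atomic_cmpxchg(&a, 0, 1);	/* fine */
	atomic_cmpxchg(&b, 0, 1);	/* the macro compiled this type confusion;
					 * the inline function rejects it */
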
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 418e632d4a80..7a1065958ba9 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h | |||
@@ -8,7 +8,7 @@ | |||
8 | 8 | ||
9 | #ifdef __KERNEL__ | 9 | #ifdef __KERNEL__ |
10 | 10 | ||
11 | #include <asm/page_types.h> | 11 | #include <asm/pgtable_types.h> |
12 | 12 | ||
13 | /* Physical address where kernel should be loaded. */ | 13 | /* Physical address where kernel should be loaded. */ |
14 | #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ | 14 | #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ |
@@ -16,10 +16,10 @@ | |||
16 | & ~(CONFIG_PHYSICAL_ALIGN - 1)) | 16 | & ~(CONFIG_PHYSICAL_ALIGN - 1)) |
17 | 17 | ||
18 | /* Minimum kernel alignment, as a power of two */ | 18 | /* Minimum kernel alignment, as a power of two */ |
19 | #ifdef CONFIG_x86_64 | 19 | #ifdef CONFIG_X86_64 |
20 | #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT | 20 | #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT |
21 | #else | 21 | #else |
22 | #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) | 22 | #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) |
23 | #endif | 23 | #endif |
24 | #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) | 24 | #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) |
25 | 25 | ||
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c45f415ce315..c993e9e0fed4 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h | |||
@@ -1,7 +1,6 @@ | |||
1 | #ifndef _ASM_X86_DESC_H | 1 | #ifndef _ASM_X86_DESC_H |
2 | #define _ASM_X86_DESC_H | 2 | #define _ASM_X86_DESC_H |
3 | 3 | ||
4 | #ifndef __ASSEMBLY__ | ||
5 | #include <asm/desc_defs.h> | 4 | #include <asm/desc_defs.h> |
6 | #include <asm/ldt.h> | 5 | #include <asm/ldt.h> |
7 | #include <asm/mmu.h> | 6 | #include <asm/mmu.h> |
@@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) | |||
380 | _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); | 379 | _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); |
381 | } | 380 | } |
382 | 381 | ||
383 | #else | ||
384 | /* | ||
385 | * GET_DESC_BASE reads the descriptor base of the specified segment. | ||
386 | * | ||
387 | * Args: | ||
388 | * idx - descriptor index | ||
389 | * gdt - GDT pointer | ||
390 | * base - 32bit register to which the base will be written | ||
391 | * lo_w - lo word of the "base" register | ||
392 | * lo_b - lo byte of the "base" register | ||
393 | * hi_b - hi byte of the low word of the "base" register | ||
394 | * | ||
395 | * Example: | ||
396 | * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) | ||
397 | * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. | ||
398 | */ | ||
399 | #define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ | ||
400 | movb idx * 8 + 4(gdt), lo_b; \ | ||
401 | movb idx * 8 + 7(gdt), hi_b; \ | ||
402 | shll $16, base; \ | ||
403 | movw idx * 8 + 2(gdt), lo_w; | ||
404 | |||
405 | |||
406 | #endif /* __ASSEMBLY__ */ | ||
407 | |||
408 | #endif /* _ASM_X86_DESC_H */ | 382 | #endif /* _ASM_X86_DESC_H */ |
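With the GET_DESC_BASE assembler macro gone (it no longer had assembly users), desc.h is C-only and the __ASSEMBLY__ split disappears. What the macro computed survives in C form; a sketch modeled on the kernel's get_desc_base(), which stitches the 32-bit base out of the three fields of an 8-byte descriptor:

	static inline unsigned long desc_base(const struct desc_struct *d)
	{
		return d->base0 |
		       ((unsigned long)d->base1 << 16) |
		       ((unsigned long)d->base2 << 24);
	}
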
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index b93405b228b4..1c3f9435f1c9 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h | |||
@@ -33,6 +33,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) | |||
33 | #endif | 33 | #endif |
34 | } | 34 | } |
35 | 35 | ||
36 | #include <asm-generic/dma-mapping-common.h> | ||
37 | |||
36 | /* Make sure we keep the same behaviour */ | 38 | /* Make sure we keep the same behaviour */ |
37 | static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) | 39 | static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) |
38 | { | 40 | { |
@@ -53,177 +55,6 @@ extern int dma_set_mask(struct device *dev, u64 mask); | |||
53 | extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, | 55 | extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, |
54 | dma_addr_t *dma_addr, gfp_t flag); | 56 | dma_addr_t *dma_addr, gfp_t flag); |
55 | 57 | ||
56 | static inline dma_addr_t | ||
57 | dma_map_single(struct device *hwdev, void *ptr, size_t size, | ||
58 | enum dma_data_direction dir) | ||
59 | { | ||
60 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
61 | dma_addr_t addr; | ||
62 | |||
63 | BUG_ON(!valid_dma_direction(dir)); | ||
64 | kmemcheck_mark_initialized(ptr, size); | ||
65 | addr = ops->map_page(hwdev, virt_to_page(ptr), | ||
66 | (unsigned long)ptr & ~PAGE_MASK, size, | ||
67 | dir, NULL); | ||
68 | debug_dma_map_page(hwdev, virt_to_page(ptr), | ||
69 | (unsigned long)ptr & ~PAGE_MASK, size, | ||
70 | dir, addr, true); | ||
71 | return addr; | ||
72 | } | ||
73 | |||
74 | static inline void | ||
75 | dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, | ||
76 | enum dma_data_direction dir) | ||
77 | { | ||
78 | struct dma_map_ops *ops = get_dma_ops(dev); | ||
79 | |||
80 | BUG_ON(!valid_dma_direction(dir)); | ||
81 | if (ops->unmap_page) | ||
82 | ops->unmap_page(dev, addr, size, dir, NULL); | ||
83 | debug_dma_unmap_page(dev, addr, size, dir, true); | ||
84 | } | ||
85 | |||
86 | static inline int | ||
87 | dma_map_sg(struct device *hwdev, struct scatterlist *sg, | ||
88 | int nents, enum dma_data_direction dir) | ||
89 | { | ||
90 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
91 | int ents; | ||
92 | struct scatterlist *s; | ||
93 | int i; | ||
94 | |||
95 | BUG_ON(!valid_dma_direction(dir)); | ||
96 | for_each_sg(sg, s, nents, i) | ||
97 | kmemcheck_mark_initialized(sg_virt(s), s->length); | ||
98 | ents = ops->map_sg(hwdev, sg, nents, dir, NULL); | ||
99 | debug_dma_map_sg(hwdev, sg, nents, ents, dir); | ||
100 | |||
101 | return ents; | ||
102 | } | ||
103 | |||
104 | static inline void | ||
105 | dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, | ||
106 | enum dma_data_direction dir) | ||
107 | { | ||
108 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
109 | |||
110 | BUG_ON(!valid_dma_direction(dir)); | ||
111 | debug_dma_unmap_sg(hwdev, sg, nents, dir); | ||
112 | if (ops->unmap_sg) | ||
113 | ops->unmap_sg(hwdev, sg, nents, dir, NULL); | ||
114 | } | ||
115 | |||
116 | static inline void | ||
117 | dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle, | ||
118 | size_t size, enum dma_data_direction dir) | ||
119 | { | ||
120 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
121 | |||
122 | BUG_ON(!valid_dma_direction(dir)); | ||
123 | if (ops->sync_single_for_cpu) | ||
124 | ops->sync_single_for_cpu(hwdev, dma_handle, size, dir); | ||
125 | debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir); | ||
126 | flush_write_buffers(); | ||
127 | } | ||
128 | |||
129 | static inline void | ||
130 | dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle, | ||
131 | size_t size, enum dma_data_direction dir) | ||
132 | { | ||
133 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
134 | |||
135 | BUG_ON(!valid_dma_direction(dir)); | ||
136 | if (ops->sync_single_for_device) | ||
137 | ops->sync_single_for_device(hwdev, dma_handle, size, dir); | ||
138 | debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir); | ||
139 | flush_write_buffers(); | ||
140 | } | ||
141 | |||
142 | static inline void | ||
143 | dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle, | ||
144 | unsigned long offset, size_t size, | ||
145 | enum dma_data_direction dir) | ||
146 | { | ||
147 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
148 | |||
149 | BUG_ON(!valid_dma_direction(dir)); | ||
150 | if (ops->sync_single_range_for_cpu) | ||
151 | ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, | ||
152 | size, dir); | ||
153 | debug_dma_sync_single_range_for_cpu(hwdev, dma_handle, | ||
154 | offset, size, dir); | ||
155 | flush_write_buffers(); | ||
156 | } | ||
157 | |||
158 | static inline void | ||
159 | dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle, | ||
160 | unsigned long offset, size_t size, | ||
161 | enum dma_data_direction dir) | ||
162 | { | ||
163 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
164 | |||
165 | BUG_ON(!valid_dma_direction(dir)); | ||
166 | if (ops->sync_single_range_for_device) | ||
167 | ops->sync_single_range_for_device(hwdev, dma_handle, | ||
168 | offset, size, dir); | ||
169 | debug_dma_sync_single_range_for_device(hwdev, dma_handle, | ||
170 | offset, size, dir); | ||
171 | flush_write_buffers(); | ||
172 | } | ||
173 | |||
174 | static inline void | ||
175 | dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, | ||
176 | int nelems, enum dma_data_direction dir) | ||
177 | { | ||
178 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
179 | |||
180 | BUG_ON(!valid_dma_direction(dir)); | ||
181 | if (ops->sync_sg_for_cpu) | ||
182 | ops->sync_sg_for_cpu(hwdev, sg, nelems, dir); | ||
183 | debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir); | ||
184 | flush_write_buffers(); | ||
185 | } | ||
186 | |||
187 | static inline void | ||
188 | dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, | ||
189 | int nelems, enum dma_data_direction dir) | ||
190 | { | ||
191 | struct dma_map_ops *ops = get_dma_ops(hwdev); | ||
192 | |||
193 | BUG_ON(!valid_dma_direction(dir)); | ||
194 | if (ops->sync_sg_for_device) | ||
195 | ops->sync_sg_for_device(hwdev, sg, nelems, dir); | ||
196 | debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir); | ||
197 | |||
198 | flush_write_buffers(); | ||
199 | } | ||
200 | |||
201 | static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, | ||
202 | size_t offset, size_t size, | ||
203 | enum dma_data_direction dir) | ||
204 | { | ||
205 | struct dma_map_ops *ops = get_dma_ops(dev); | ||
206 | dma_addr_t addr; | ||
207 | |||
208 | BUG_ON(!valid_dma_direction(dir)); | ||
209 | kmemcheck_mark_initialized(page_address(page) + offset, size); | ||
210 | addr = ops->map_page(dev, page, offset, size, dir, NULL); | ||
211 | debug_dma_map_page(dev, page, offset, size, dir, addr, false); | ||
212 | |||
213 | return addr; | ||
214 | } | ||
215 | |||
216 | static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, | ||
217 | size_t size, enum dma_data_direction dir) | ||
218 | { | ||
219 | struct dma_map_ops *ops = get_dma_ops(dev); | ||
220 | |||
221 | BUG_ON(!valid_dma_direction(dir)); | ||
222 | if (ops->unmap_page) | ||
223 | ops->unmap_page(dev, addr, size, dir, NULL); | ||
224 | debug_dma_unmap_page(dev, addr, size, dir, false); | ||
225 | } | ||
226 | |||
227 | static inline void | 58 | static inline void |
228 | dma_cache_sync(struct device *dev, void *vaddr, size_t size, | 59 | dma_cache_sync(struct device *dev, void *vaddr, size_t size, |
229 | enum dma_data_direction dir) | 60 | enum dma_data_direction dir) |
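This is the largest mechanical deletion in the series: roughly 170 lines of per-arch wrappers (dma_map_single(), dma_map_sg(), the sync variants, dma_map_page()) are replaced by the shared asm-generic/dma-mapping-common.h included above, which pairs with the select HAVE_DMA_ATTRS added in the Kconfig hunk. The driver-visible API is unchanged; a typical streaming-mapping sequence still reads:

	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, handle))
		return -ENOMEM;
	/* ... device DMAs from 'handle' ... */
	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
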
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index edc90f23e708..8406ed7f9926 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h | |||
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); | |||
33 | #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ | 33 | #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ |
34 | efi_call_virt(f, a1, a2, a3, a4, a5, a6) | 34 | efi_call_virt(f, a1, a2, a3, a4, a5, a6) |
35 | 35 | ||
36 | #define efi_ioremap(addr, size) ioremap_cache(addr, size) | 36 | #define efi_ioremap(addr, size, type) ioremap_cache(addr, size) |
37 | 37 | ||
38 | #else /* !CONFIG_X86_32 */ | 38 | #else /* !CONFIG_X86_32 */ |
39 | 39 | ||
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, | |||
84 | efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ | 84 | efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ |
85 | (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) | 85 | (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) |
86 | 86 | ||
87 | extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); | 87 | extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, |
88 | u32 type); | ||
88 | 89 | ||
89 | #endif /* CONFIG_X86_32 */ | 90 | #endif /* CONFIG_X86_32 */ |
90 | 91 | ||
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 2d81af3974a0..7b2d71df39a6 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h | |||
@@ -111,12 +111,9 @@ enum fixed_addresses { | |||
111 | #ifdef CONFIG_PARAVIRT | 111 | #ifdef CONFIG_PARAVIRT |
112 | FIX_PARAVIRT_BOOTMAP, | 112 | FIX_PARAVIRT_BOOTMAP, |
113 | #endif | 113 | #endif |
114 | FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ | 114 | FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ |
115 | FIX_TEXT_POKE1, | 115 | FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ |
116 | __end_of_permanent_fixed_addresses, | 116 | __end_of_permanent_fixed_addresses, |
117 | #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | ||
118 | FIX_OHCI1394_BASE, | ||
119 | #endif | ||
120 | /* | 117 | /* |
121 | * 256 temporary boot-time mappings, used by early_ioremap(), | 118 | * 256 temporary boot-time mappings, used by early_ioremap(), |
122 | * before ioremap() is functional. | 119 | * before ioremap() is functional. |
@@ -129,6 +126,9 @@ enum fixed_addresses { | |||
129 | FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - | 126 | FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - |
130 | (__end_of_permanent_fixed_addresses & 255), | 127 | (__end_of_permanent_fixed_addresses & 255), |
131 | FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, | 128 | FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, |
129 | #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT | ||
130 | FIX_OHCI1394_BASE, | ||
131 | #endif | ||
132 | #ifdef CONFIG_X86_32 | 132 | #ifdef CONFIG_X86_32 |
133 | FIX_WP_TEST, | 133 | FIX_WP_TEST, |
134 | #endif | 134 | #endif |
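Fixmap slots are handed out from the top of the address space downward, so a larger enum index means a lower virtual address. That is why FIX_TEXT_POKE0, the first of text_poke()'s two pages, must now be declared after FIX_TEXT_POKE1: it gets the lower address, making the pair ascending and contiguous. (FIX_OHCI1394_BASE also moves below the boot-time mapping block.) The address math that makes this so:

	#define __fix_to_virt(x)  (FIXADDR_TOP - ((x) << PAGE_SHIFT))

	/*
	 * FIX_TEXT_POKE1 declared first gets index N, FIX_TEXT_POKE0 gets N + 1,
	 * hence:
	 *   fix_to_virt(FIX_TEXT_POKE0) + PAGE_SIZE == fix_to_virt(FIX_TEXT_POKE1)
	 */
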
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index bd2c6511c887..db24c2278be0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h | |||
@@ -28,13 +28,6 @@ | |||
28 | 28 | ||
29 | #endif | 29 | #endif |
30 | 30 | ||
31 | /* FIXME: I don't want to stay hardcoded */ | ||
32 | #ifdef CONFIG_X86_64 | ||
33 | # define FTRACE_SYSCALL_MAX 296 | ||
34 | #else | ||
35 | # define FTRACE_SYSCALL_MAX 333 | ||
36 | #endif | ||
37 | |||
38 | #ifdef CONFIG_FUNCTION_TRACER | 31 | #ifdef CONFIG_FUNCTION_TRACER |
39 | #define MCOUNT_ADDR ((long)(mcount)) | 32 | #define MCOUNT_ADDR ((long)(mcount)) |
40 | #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ | 33 | #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ |
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index daf866ed0612..330ee807f89e 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h | |||
@@ -161,6 +161,7 @@ extern int io_apic_set_pci_routing(struct device *dev, int irq, | |||
161 | struct io_apic_irq_attr *irq_attr); | 161 | struct io_apic_irq_attr *irq_attr); |
162 | extern int (*ioapic_renumber_irq)(int ioapic, int irq); | 162 | extern int (*ioapic_renumber_irq)(int ioapic, int irq); |
163 | extern void ioapic_init_mappings(void); | 163 | extern void ioapic_init_mappings(void); |
164 | extern void ioapic_insert_resources(void); | ||
164 | 165 | ||
165 | extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); | 166 | extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); |
166 | extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); | 167 | extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); |
@@ -180,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin, | |||
180 | #define io_apic_assign_pci_irqs 0 | 181 | #define io_apic_assign_pci_irqs 0 |
181 | static const int timer_through_8259 = 0; | 182 | static const int timer_through_8259 = 0; |
182 | static inline void ioapic_init_mappings(void) { } | 183 | static inline void ioapic_init_mappings(void) { } |
184 | static inline void ioapic_insert_resources(void) { } | ||
183 | 185 | ||
184 | static inline void probe_nr_irqs_gsi(void) { } | 186 | static inline void probe_nr_irqs_gsi(void) { } |
185 | #endif | 187 | #endif |
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index af326a2975b5..fd6d21bbee6c 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h | |||
@@ -6,6 +6,7 @@ extern void no_iommu_init(void); | |||
6 | extern struct dma_map_ops nommu_dma_ops; | 6 | extern struct dma_map_ops nommu_dma_ops; |
7 | extern int force_iommu, no_iommu; | 7 | extern int force_iommu, no_iommu; |
8 | extern int iommu_detected; | 8 | extern int iommu_detected; |
9 | extern int iommu_pass_through; | ||
9 | 10 | ||
10 | /* 10 seconds */ | 11 | /* 10 seconds */ |
11 | #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) | 12 | #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) |
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2bdab21f0898..c6ccbe7e81ad 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h | |||
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void) | |||
12 | { | 12 | { |
13 | unsigned long flags; | 13 | unsigned long flags; |
14 | 14 | ||
15 | /* | ||
16 | * Note: this needs to be "=r" not "=rm", because we have the | ||
17 | * stack offset from what gcc expects at the time the "pop" is | ||
18 | * executed, and so a memory reference with respect to the stack | ||
19 | * would end up using the wrong address. | ||
20 | */ | ||
15 | asm volatile("# __raw_save_flags\n\t" | 21 | asm volatile("# __raw_save_flags\n\t" |
16 | "pushf ; pop %0" | 22 | "pushf ; pop %0" |
17 | : "=g" (flags) | 23 | : "=r" (flags) |
18 | : /* no input */ | 24 | : /* no input */ |
19 | : "memory"); | 25 | : "memory"); |
20 | 26 | ||
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389cd50d2..5136dad57cbb 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h | |||
@@ -17,8 +17,7 @@ | |||
17 | /* Pages for switcher itself, then two pages per cpu */ | 17 | /* Pages for switcher itself, then two pages per cpu */ |
18 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) | 18 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) |
19 | 19 | ||
20 | /* We map at -4M (-2M when PAE is activated) for ease of mapping | 20 | /* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ |
21 | * into the guest (one PTE page). */ | ||
22 | #ifdef CONFIG_X86_PAE | 21 | #ifdef CONFIG_X86_PAE |
23 | #define SWITCHER_ADDR 0xFFE00000 | 22 | #define SWITCHER_ADDR 0xFFE00000 |
24 | #else | 23 | #else |
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index d31c4a684078..ba0eed8aa1a6 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h | |||
@@ -30,27 +30,27 @@ | |||
30 | #include <asm/hw_irq.h> | 30 | #include <asm/hw_irq.h> |
31 | #include <asm/kvm_para.h> | 31 | #include <asm/kvm_para.h> |
32 | 32 | ||
33 | /*G:031 But first, how does our Guest contact the Host to ask for privileged | 33 | /*G:030 |
34 | * But first, how does our Guest contact the Host to ask for privileged | ||
34 | * operations? There are two ways: the direct way is to make a "hypercall", | 35 | * operations? There are two ways: the direct way is to make a "hypercall", |
35 | * to make requests of the Host Itself. | 36 | * to make requests of the Host Itself. |
36 | * | 37 | * |
37 | * We use the KVM hypercall mechanism. Seventeen hypercalls are | 38 | * We use the KVM hypercall mechanism, though completely different hypercall |
38 | * available: the hypercall number is put in the %eax register, and the | 39 | * numbers. Seventeen hypercalls are available: the hypercall number is put in |
39 | * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. | 40 | * the %eax register, and the arguments (when required) are placed in %ebx, |
40 | * If a return value makes sense, it's returned in %eax. | 41 | * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. |
41 | * | 42 | * |
42 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful | 43 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful |
43 | * Host, rather than returning failure. This reflects Winston Churchill's | 44 | * Host, rather than returning failure. This reflects Winston Churchill's |
44 | * definition of a gentleman: "someone who is only rude intentionally". */ | 45 | * definition of a gentleman: "someone who is only rude intentionally". |
45 | /*:*/ | 46 | :*/ |
46 | 47 | ||
47 | /* Can't use our min() macro here: needs to be a constant */ | 48 | /* Can't use our min() macro here: needs to be a constant */ |
48 | #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) | 49 | #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) |
49 | 50 | ||
50 | #define LHCALL_RING_SIZE 64 | 51 | #define LHCALL_RING_SIZE 64 |
51 | struct hcall_args { | 52 | struct hcall_args { |
52 | /* These map directly onto eax, ebx, ecx, edx and esi | 53 | /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ |
53 | * in struct lguest_regs */ | ||
54 | unsigned long arg0, arg1, arg2, arg3, arg4; | 54 | unsigned long arg0, arg1, arg2, arg3, arg4; |
55 | }; | 55 | }; |
56 | 56 | ||
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 540a466e50f5..5cdd8d100ec9 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -102,15 +102,39 @@ struct mce_log { | |||
102 | 102 | ||
103 | #ifdef __KERNEL__ | 103 | #ifdef __KERNEL__ |
104 | 104 | ||
105 | #include <linux/percpu.h> | ||
106 | #include <linux/init.h> | ||
107 | #include <asm/atomic.h> | ||
108 | |||
105 | extern int mce_disabled; | 109 | extern int mce_disabled; |
110 | extern int mce_p5_enabled; | ||
106 | 111 | ||
107 | #include <asm/atomic.h> | 112 | #ifdef CONFIG_X86_MCE |
108 | #include <linux/percpu.h> | 113 | void mcheck_init(struct cpuinfo_x86 *c); |
114 | #else | ||
115 | static inline void mcheck_init(struct cpuinfo_x86 *c) {} | ||
116 | #endif | ||
117 | |||
118 | #ifdef CONFIG_X86_OLD_MCE | ||
119 | extern int nr_mce_banks; | ||
120 | void amd_mcheck_init(struct cpuinfo_x86 *c); | ||
121 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c); | ||
122 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c); | ||
123 | #endif | ||
124 | |||
125 | #ifdef CONFIG_X86_ANCIENT_MCE | ||
126 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); | ||
127 | void winchip_mcheck_init(struct cpuinfo_x86 *c); | ||
128 | static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } | ||
129 | #else | ||
130 | static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} | ||
131 | static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} | ||
132 | static inline void enable_p5_mce(void) {} | ||
133 | #endif | ||
109 | 134 | ||
110 | void mce_setup(struct mce *m); | 135 | void mce_setup(struct mce *m); |
111 | void mce_log(struct mce *m); | 136 | void mce_log(struct mce *m); |
112 | DECLARE_PER_CPU(struct sys_device, mce_dev); | 137 | DECLARE_PER_CPU(struct sys_device, mce_dev); |
113 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | ||
114 | 138 | ||
115 | /* | 139 | /* |
116 | * To support more than 128 would need to escape the predefined | 140 | * To support more than 128 would need to escape the predefined |
@@ -145,12 +169,8 @@ int mce_available(struct cpuinfo_x86 *c); | |||
145 | DECLARE_PER_CPU(unsigned, mce_exception_count); | 169 | DECLARE_PER_CPU(unsigned, mce_exception_count); |
146 | DECLARE_PER_CPU(unsigned, mce_poll_count); | 170 | DECLARE_PER_CPU(unsigned, mce_poll_count); |
147 | 171 | ||
148 | void mce_log_therm_throt_event(__u64 status); | ||
149 | |||
150 | extern atomic_t mce_entry; | 172 | extern atomic_t mce_entry; |
151 | 173 | ||
152 | void do_machine_check(struct pt_regs *, long); | ||
153 | |||
154 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); | 174 | typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); |
155 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); | 175 | DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); |
156 | 176 | ||
@@ -167,13 +187,32 @@ void mce_notify_process(void); | |||
167 | DECLARE_PER_CPU(struct mce, injectm); | 187 | DECLARE_PER_CPU(struct mce, injectm); |
168 | extern struct file_operations mce_chrdev_ops; | 188 | extern struct file_operations mce_chrdev_ops; |
169 | 189 | ||
170 | #ifdef CONFIG_X86_MCE | 190 | /* |
171 | void mcheck_init(struct cpuinfo_x86 *c); | 191 | * Exception handler |
172 | #else | 192 | */ |
173 | #define mcheck_init(c) do { } while (0) | 193 | |
174 | #endif | 194 | /* Call the installed machine check handler for this CPU setup. */ |
195 | extern void (*machine_check_vector)(struct pt_regs *, long error_code); | ||
196 | void do_machine_check(struct pt_regs *, long); | ||
197 | |||
198 | /* | ||
199 | * Threshold handler | ||
200 | */ | ||
175 | 201 | ||
176 | extern void (*mce_threshold_vector)(void); | 202 | extern void (*mce_threshold_vector)(void); |
203 | extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | ||
204 | |||
205 | /* | ||
206 | * Thermal handler | ||
207 | */ | ||
208 | |||
209 | void intel_init_thermal(struct cpuinfo_x86 *c); | ||
210 | |||
211 | #ifdef CONFIG_X86_NEW_MCE | ||
212 | void mce_log_therm_throt_event(__u64 status); | ||
213 | #else | ||
214 | static inline void mce_log_therm_throt_event(__u64 status) {} | ||
215 | #endif | ||
177 | 216 | ||
178 | #endif /* __KERNEL__ */ | 217 | #endif /* __KERNEL__ */ |
179 | #endif /* _ASM_X86_MCE_H */ | 218 | #endif /* _ASM_X86_MCE_H */ |
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e3..6be7fc254b59 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
@@ -246,10 +246,6 @@ | |||
246 | #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) | 246 | #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) |
247 | #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) | 247 | #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) |
248 | 248 | ||
249 | /* Intel Model 6 */ | ||
250 | #define MSR_P6_EVNTSEL0 0x00000186 | ||
251 | #define MSR_P6_EVNTSEL1 0x00000187 | ||
252 | |||
253 | /* P4/Xeon+ specific */ | 249 | /* P4/Xeon+ specific */ |
254 | #define MSR_IA32_MCG_EAX 0x00000180 | 250 | #define MSR_IA32_MCG_EAX 0x00000180 |
255 | #define MSR_IA32_MCG_EBX 0x00000181 | 251 | #define MSR_IA32_MCG_EBX 0x00000181 |
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 22603764e7db..48ad9d29484a 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h | |||
@@ -3,13 +3,10 @@ | |||
3 | 3 | ||
4 | #include <asm/msr-index.h> | 4 | #include <asm/msr-index.h> |
5 | 5 | ||
6 | #ifndef __ASSEMBLY__ | ||
7 | # include <linux/types.h> | ||
8 | #endif | ||
9 | |||
10 | #ifdef __KERNEL__ | 6 | #ifdef __KERNEL__ |
11 | #ifndef __ASSEMBLY__ | 7 | #ifndef __ASSEMBLY__ |
12 | 8 | ||
9 | #include <linux/types.h> | ||
13 | #include <asm/asm.h> | 10 | #include <asm/asm.h> |
14 | #include <asm/errno.h> | 11 | #include <asm/errno.h> |
15 | #include <asm/cpumask.h> | 12 | #include <asm/cpumask.h> |
@@ -264,6 +261,4 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | |||
264 | #endif /* CONFIG_SMP */ | 261 | #endif /* CONFIG_SMP */ |
265 | #endif /* __ASSEMBLY__ */ | 262 | #endif /* __ASSEMBLY__ */ |
266 | #endif /* __KERNEL__ */ | 263 | #endif /* __KERNEL__ */ |
267 | |||
268 | |||
269 | #endif /* _ASM_X86_MSR_H */ | 264 | #endif /* _ASM_X86_MSR_H */ |
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c97264409934..c86e5ed4af51 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h | |||
@@ -72,7 +72,6 @@ void lapic_watchdog_stop(void); | |||
72 | int lapic_watchdog_init(unsigned nmi_hz); | 72 | int lapic_watchdog_init(unsigned nmi_hz); |
73 | int lapic_wd_event(unsigned nmi_hz); | 73 | int lapic_wd_event(unsigned nmi_hz); |
74 | unsigned lapic_adjust_nmi_hz(unsigned hz); | 74 | unsigned lapic_adjust_nmi_hz(unsigned hz); |
75 | int lapic_watchdog_ok(void); | ||
76 | void disable_lapic_nmi_watchdog(void); | 75 | void disable_lapic_nmi_watchdog(void); |
77 | void enable_lapic_nmi_watchdog(void); | 76 | void enable_lapic_nmi_watchdog(void); |
78 | void stop_nmi(void); | 77 | void stop_nmi(void); |
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8d382d3abf38..7639dbf5d223 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h | |||
@@ -41,7 +41,7 @@ | |||
41 | 41 | ||
42 | /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ | 42 | /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ |
43 | #define __PHYSICAL_MASK_SHIFT 46 | 43 | #define __PHYSICAL_MASK_SHIFT 46 |
44 | #define __VIRTUAL_MASK_SHIFT 48 | 44 | #define __VIRTUAL_MASK_SHIFT 47 |
45 | 45 | ||
46 | /* | 46 | /* |
47 | * Kernel image size is limited to 512 MB (see level2_kernel_pgt in | 47 | * Kernel image size is limited to 512 MB (see level2_kernel_pgt in |
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b51a1e8b0baf..1ff685ca221c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h | |||
@@ -91,7 +91,7 @@ extern void pci_iommu_alloc(void); | |||
91 | 91 | ||
92 | #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) | 92 | #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) |
93 | 93 | ||
94 | #if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) | 94 | #if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) |
95 | 95 | ||
96 | #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ | 96 | #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ |
97 | dma_addr_t ADDR_NAME; | 97 | dma_addr_t ADDR_NAME; |
@@ -130,6 +130,7 @@ extern void pci_iommu_alloc(void); | |||
130 | 130 | ||
131 | /* generic pci stuff */ | 131 | /* generic pci stuff */ |
132 | #include <asm-generic/pci.h> | 132 | #include <asm-generic/pci.h> |
133 | #define PCIBIOS_MAX_MEM_32 0xffffffff | ||
133 | 134 | ||
134 | #ifdef CONFIG_NUMA | 135 | #ifdef CONFIG_NUMA |
135 | /* Returns the node based on pci bus */ | 136 | /* Returns the node based on pci bus */ |
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index e60fd3e14bdf..b399988eee3a 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h | |||
@@ -121,6 +121,9 @@ extern int __init pcibios_init(void); | |||
121 | extern int __init pci_mmcfg_arch_init(void); | 121 | extern int __init pci_mmcfg_arch_init(void); |
122 | extern void __init pci_mmcfg_arch_free(void); | 122 | extern void __init pci_mmcfg_arch_free(void); |
123 | 123 | ||
124 | extern struct acpi_mcfg_allocation *pci_mmcfg_config; | ||
125 | extern int pci_mmcfg_config_num; | ||
126 | |||
124 | /* | 127 | /* |
125 | * AMD Fam10h CPUs are buggy, and cannot access MMIO config space | 128 | * AMD Fam10h CPUs are buggy, and cannot access MMIO config space |
126 | * on their northbridge except through the %eax register. As such, you MUST | 129 | * on their northbridge except through the %eax register. As such, you MUST |
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 02ecb30982a3..103f1ddb0d85 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h | |||
@@ -42,6 +42,7 @@ | |||
42 | 42 | ||
43 | #else /* ...!ASSEMBLY */ | 43 | #else /* ...!ASSEMBLY */ |
44 | 44 | ||
45 | #include <linux/kernel.h> | ||
45 | #include <linux/stringify.h> | 46 | #include <linux/stringify.h> |
46 | 47 | ||
47 | #ifdef CONFIG_SMP | 48 | #ifdef CONFIG_SMP |
@@ -155,6 +156,15 @@ do { \ | |||
155 | /* We can use this directly for local CPU (faster). */ | 156 | /* We can use this directly for local CPU (faster). */ |
156 | DECLARE_PER_CPU(unsigned long, this_cpu_off); | 157 | DECLARE_PER_CPU(unsigned long, this_cpu_off); |
157 | 158 | ||
159 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
160 | void *pcpu_lpage_remapped(void *kaddr); | ||
161 | #else | ||
162 | static inline void *pcpu_lpage_remapped(void *kaddr) | ||
163 | { | ||
164 | return NULL; | ||
165 | } | ||
166 | #endif | ||
167 | |||
158 | #endif /* !__ASSEMBLY__ */ | 168 | #endif /* !__ASSEMBLY__ */ |
159 | 169 | ||
160 | #ifdef CONFIG_SMP | 170 | #ifdef CONFIG_SMP |
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 876ed97147b3..fa64e401589d 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h | |||
@@ -84,14 +84,12 @@ union cpuid10_edx { | |||
84 | #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b | 84 | #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b |
85 | #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) | 85 | #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) |
86 | 86 | ||
87 | extern void set_perf_counter_pending(void); | ||
88 | |||
89 | #define clear_perf_counter_pending() do { } while (0) | ||
90 | #define test_perf_counter_pending() (0) | ||
91 | |||
92 | #ifdef CONFIG_PERF_COUNTERS | 87 | #ifdef CONFIG_PERF_COUNTERS |
93 | extern void init_hw_perf_counters(void); | 88 | extern void init_hw_perf_counters(void); |
94 | extern void perf_counters_lapic_init(void); | 89 | extern void perf_counters_lapic_init(void); |
90 | |||
91 | #define PERF_COUNTER_INDEX_OFFSET 0 | ||
92 | |||
95 | #else | 93 | #else |
96 | static inline void init_hw_perf_counters(void) { } | 94 | static inline void init_hw_perf_counters(void) { } |
97 | static inline void perf_counters_lapic_init(void) { } | 95 | static inline void perf_counters_lapic_init(void) { } |
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd14c54ac718..0e8c2a0fd922 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h | |||
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) | |||
46 | __free_page(pte); | 46 | __free_page(pte); |
47 | } | 47 | } |
48 | 48 | ||
49 | extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); | 49 | extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); |
50 | |||
51 | static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, | ||
52 | unsigned long address) | ||
53 | { | ||
54 | ___pte_free_tlb(tlb, pte); | ||
55 | } | ||
50 | 56 | ||
51 | static inline void pmd_populate_kernel(struct mm_struct *mm, | 57 | static inline void pmd_populate_kernel(struct mm_struct *mm, |
52 | pmd_t *pmd, pte_t *pte) | 58 | pmd_t *pmd, pte_t *pte) |
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) | |||
78 | free_page((unsigned long)pmd); | 84 | free_page((unsigned long)pmd); |
79 | } | 85 | } |
80 | 86 | ||
81 | extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); | 87 | extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); |
88 | |||
89 | static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, | ||
90 | unsigned long address) ||
91 | { | ||
92 | ___pmd_free_tlb(tlb, pmd); | ||
93 | } | ||
82 | 94 | ||
83 | #ifdef CONFIG_X86_PAE | 95 | #ifdef CONFIG_X86_PAE |
84 | extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); | 96 | extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); |
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) | |||
108 | free_page((unsigned long)pud); | 120 | free_page((unsigned long)pud); |
109 | } | 121 | } |
110 | 122 | ||
111 | extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); | 123 | extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); |
124 | |||
125 | static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, | ||
126 | unsigned long address) | ||
127 | { | ||
128 | ___pud_free_tlb(tlb, pud); | ||
129 | } | ||
130 | |||
112 | #endif /* PAGETABLE_LEVELS > 3 */ | 131 | #endif /* PAGETABLE_LEVELS > 3 */ |
113 | #endif /* PAGETABLE_LEVELS > 2 */ | 132 | #endif /* PAGETABLE_LEVELS > 2 */ |
114 | 133 | ||
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cc06e3fceb8..16748077559a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _ASM_X86_PGTABLE_H | 2 | #define _ASM_X86_PGTABLE_H |
3 | 3 | ||
4 | #include <asm/page.h> | 4 | #include <asm/page.h> |
5 | #include <asm/e820.h> | ||
5 | 6 | ||
6 | #include <asm/pgtable_types.h> | 7 | #include <asm/pgtable_types.h> |
7 | 8 | ||
@@ -269,10 +270,17 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
269 | 270 | ||
270 | #define canon_pgprot(p) __pgprot(massage_pgprot(p)) | 271 | #define canon_pgprot(p) __pgprot(massage_pgprot(p)) |
271 | 272 | ||
272 | static inline int is_new_memtype_allowed(unsigned long flags, | 273 | static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, |
273 | unsigned long new_flags) | 274 | unsigned long flags, |
275 | unsigned long new_flags) | ||
274 | { | 276 | { |
275 | /* | 277 | /* |
278 | * PAT type is always WB for ISA. So no need to check. | ||
279 | */ | ||
280 | if (is_ISA_range(paddr, paddr + size - 1)) | ||
281 | return 1; | ||
282 | |||
283 | /* | ||
276 | * Certain new memtypes are not allowed with certain | 284 | * Certain new memtypes are not allowed with certain |
277 | * requested memtype: | 285 | * requested memtype: |
278 | * - request is uncached, return cannot be write-back | 286 | * - request is uncached, return cannot be write-back |
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 31bd120cf2a2..01fd9461d323 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h | |||
@@ -49,13 +49,17 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); | |||
49 | #endif | 49 | #endif |
50 | 50 | ||
51 | #if defined(CONFIG_HIGHPTE) | 51 | #if defined(CONFIG_HIGHPTE) |
52 | #define __KM_PTE \ | ||
53 | (in_nmi() ? KM_NMI_PTE : \ | ||
54 | in_irq() ? KM_IRQ_PTE : \ | ||
55 | KM_PTE0) | ||
52 | #define pte_offset_map(dir, address) \ | 56 | #define pte_offset_map(dir, address) \ |
53 | ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ | 57 | ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ |
54 | pte_index((address))) | 58 | pte_index((address))) |
55 | #define pte_offset_map_nested(dir, address) \ | 59 | #define pte_offset_map_nested(dir, address) \ |
56 | ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ | 60 | ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ |
57 | pte_index((address))) | 61 | pte_index((address))) |
58 | #define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) | 62 | #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) |
59 | #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) | 63 | #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) |
60 | #else | 64 | #else |
61 | #define pte_offset_map(dir, address) \ | 65 | #define pte_offset_map(dir, address) \ |
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index abde308fdb0f..c57a30117149 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -165,10 +165,7 @@ extern void cleanup_highmap(void); | |||
165 | 165 | ||
166 | /* fs/proc/kcore.c */ | 166 | /* fs/proc/kcore.c */ |
167 | #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) | 167 | #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) |
168 | #define kc_offset_to_vaddr(o) \ | 168 | #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) |
169 | (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \ | ||
170 | ? ((o) | ~__VIRTUAL_MASK) \ | ||
171 | : (o)) | ||
172 | 169 | ||
173 | #define __HAVE_ARCH_PTE_SAME | 170 | #define __HAVE_ARCH_PTE_SAME |
174 | #endif /* !__ASSEMBLY__ */ | 171 | #endif /* !__ASSEMBLY__ */ |
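The kc_offset_to_vaddr() simplification above leans on the __VIRTUAL_MASK_SHIFT change from 48 to 47 in the page_64_types.h hunk earlier in this diff: with a 47-bit mask, every kernel virtual address sits in the negative canonical half, so bits 47..63 are always set and the old conditional sign-extension is dead code. A worked round trip (values illustrative):

    /* __VIRTUAL_MASK == (1UL << 47) - 1 == 0x00007fffffffffff */
    kc_vaddr_to_offset(0xffff880000001000UL);  /* -> 0x0000080000001000 */
    kc_offset_to_vaddr(0x0000080000001000UL);  /* -> 0xffff880000001000 */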
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 49fb3ecf3bb3..621f56d73121 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h | |||
@@ -22,7 +22,14 @@ extern int reboot_force; | |||
22 | 22 | ||
23 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); | 23 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); |
24 | 24 | ||
25 | #define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1)) | 25 | /* |
26 | #define round_down(x, y) ((x) & ~((y) - 1)) | 26 | * This looks more complex than it should be. But we need to |
27 | * get the type for the ~ right in round_down (it needs to be | ||
28 | * as wide as the result!), and we want to evaluate the macro | ||
29 | * arguments just once each. | ||
30 | */ | ||
31 | #define __round_mask(x,y) ((__typeof__(x))((y)-1)) | ||
32 | #define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1) | ||
33 | #define round_down(x,y) ((x) & ~__round_mask(x,y)) | ||
27 | 34 | ||
28 | #endif /* _ASM_X86_PROTO_H */ | 35 | #endif /* _ASM_X86_PROTO_H */ |
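To see what the comment above is guarding against, consider rounding a 64-bit address by a 32-bit unsigned alignment (values illustrative; y must be a power of two). With the old round_down(), ~((y) - 1) was computed at 32 bits and zero-extended, silently clearing bits 32..63 of x; __round_mask() casts the mask to __typeof__(x) first, so it is as wide as the result.

    /* round_up(0x1234, 0x1000) == 0x2000, round_down(0x1234, 0x1000) == 0x1000 */
    u64 addr = 0x100000000ULL;          /* 4 GB */
    unsigned int align = 0x1000;
    /* old: addr & ~(align - 1)  ->  addr & 0x00000000fffff000  ->  0  */
    /* new: round_down(addr, align)  ->  0x100000000 (high bits kept)  */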
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b7e5db876399..4e77853321db 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h | |||
@@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) | |||
302 | #define _raw_read_relax(lock) cpu_relax() | 302 | #define _raw_read_relax(lock) cpu_relax() |
303 | #define _raw_write_relax(lock) cpu_relax() | 303 | #define _raw_write_relax(lock) cpu_relax() |
304 | 304 | ||
305 | /* The {read|write|spin}_lock() on x86 are full memory barriers. */ | ||
306 | static inline void smp_mb__after_lock(void) { } | ||
307 | #define ARCH_HAS_SMP_MB_AFTER_LOCK | ||
308 | |||
305 | #endif /* _ASM_X86_SPINLOCK_H */ | 309 | #endif /* _ASM_X86_SPINLOCK_H */ |
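Usage sketch for the new helper (caller code assumed, not part of this patch): a path that needs a full barrier right after taking a lock writes

    spin_lock(&some_lock);
    smp_mb__after_lock();   /* no-op on x86: the locked insn was the barrier */

instead of an unconditional smp_mb(), so x86 avoids a redundant fence while architectures whose lock acquisition is not a full barrier still get one.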
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f517944b2b17..cf86a5e73815 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h | |||
@@ -3,6 +3,8 @@ | |||
3 | 3 | ||
4 | extern int kstack_depth_to_print; | 4 | extern int kstack_depth_to_print; |
5 | 5 | ||
6 | int x86_is_stack_id(int id, char *name); | ||
7 | |||
6 | /* Generic stack tracer with callbacks */ | 8 | /* Generic stack tracer with callbacks */ |
7 | 9 | ||
8 | struct stacktrace_ops { | 10 | struct stacktrace_ops { |
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h deleted file mode 100644 index c62349ee7860..000000000000 --- a/arch/x86/include/asm/therm_throt.h +++ /dev/null | |||
@@ -1,9 +0,0 @@ | |||
1 | #ifndef _ASM_X86_THERM_THROT_H | ||
2 | #define _ASM_X86_THERM_THROT_H | ||
3 | |||
4 | #include <asm/atomic.h> | ||
5 | |||
6 | extern atomic_t therm_throt_en; | ||
7 | int therm_throt_process(int curr); | ||
8 | |||
9 | #endif /* _ASM_X86_THERM_THROT_H */ | ||
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b0783520988b..6f7786aea4fc 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
@@ -49,7 +49,7 @@ struct thread_info { | |||
49 | .exec_domain = &default_exec_domain, \ | 49 | .exec_domain = &default_exec_domain, \ |
50 | .flags = 0, \ | 50 | .flags = 0, \ |
51 | .cpu = 0, \ | 51 | .cpu = 0, \ |
52 | .preempt_count = 1, \ | 52 | .preempt_count = INIT_PREEMPT_COUNT, \ |
53 | .addr_limit = KERNEL_DS, \ | 53 | .addr_limit = KERNEL_DS, \ |
54 | .restart_block = { \ | 54 | .restart_block = { \ |
55 | .fn = do_no_restart_syscall, \ | 55 | .fn = do_no_restart_syscall, \ |
@@ -95,7 +95,7 @@ struct thread_info { | |||
95 | #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ | 95 | #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ |
96 | #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ | 96 | #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ |
97 | #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ | 97 | #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ |
98 | #define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ | 98 | #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ |
99 | 99 | ||
100 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | 100 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) |
101 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | 101 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) |
@@ -118,17 +118,17 @@ struct thread_info { | |||
118 | #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) | 118 | #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) |
119 | #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) | 119 | #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) |
120 | #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) | 120 | #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) |
121 | #define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) | 121 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) |
122 | 122 | ||
123 | /* work to do in syscall_trace_enter() */ | 123 | /* work to do in syscall_trace_enter() */ |
124 | #define _TIF_WORK_SYSCALL_ENTRY \ | 124 | #define _TIF_WORK_SYSCALL_ENTRY \ |
125 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ | 125 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ |
126 | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) | 126 | _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) |
127 | 127 | ||
128 | /* work to do in syscall_trace_leave() */ | 128 | /* work to do in syscall_trace_leave() */ |
129 | #define _TIF_WORK_SYSCALL_EXIT \ | 129 | #define _TIF_WORK_SYSCALL_EXIT \ |
130 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ | 130 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ |
131 | _TIF_SYSCALL_FTRACE) | 131 | _TIF_SYSCALL_TRACEPOINT) |
132 | 132 | ||
133 | /* work to do on interrupt/exception return */ | 133 | /* work to do on interrupt/exception return */ |
134 | #define _TIF_WORK_MASK \ | 134 | #define _TIF_WORK_MASK \ |
@@ -137,7 +137,8 @@ struct thread_info { | |||
137 | _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) | 137 | _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) |
138 | 138 | ||
139 | /* work to do on any return to user space */ | 139 | /* work to do on any return to user space */ |
140 | #define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) | 140 | #define _TIF_ALLWORK_MASK \ |
141 | ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) | ||
141 | 142 | ||
142 | /* Only used for 64 bit */ | 143 | /* Only used for 64 bit */ |
143 | #define _TIF_DO_NOTIFY_MASK \ | 144 | #define _TIF_DO_NOTIFY_MASK \ |
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index bd37ed444a21..20ca9c4d4686 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h | |||
@@ -45,12 +45,16 @@ extern int no_timer_check; | |||
45 | */ | 45 | */ |
46 | 46 | ||
47 | DECLARE_PER_CPU(unsigned long, cyc2ns); | 47 | DECLARE_PER_CPU(unsigned long, cyc2ns); |
48 | DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); | ||
48 | 49 | ||
49 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ | 50 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ |
50 | 51 | ||
51 | static inline unsigned long long __cycles_2_ns(unsigned long long cyc) | 52 | static inline unsigned long long __cycles_2_ns(unsigned long long cyc) |
52 | { | 53 | { |
53 | return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; | 54 | int cpu = smp_processor_id(); |
55 | unsigned long long ns = per_cpu(cyc2ns_offset, cpu); | ||
56 | ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; | ||
57 | return ns; | ||
54 | } | 58 | } |
55 | 59 | ||
56 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) | 60 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) |
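To make the fixed-point math concrete (the scale setup is assumed here; it lives in tsc.c): cyc2ns is approximately (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz, so the shift by CYC2NS_SCALE_FACTOR undoes the scaling.

    /* At 1 GHz: cpu_khz == 1000000, so cyc2ns ~= 1024                */
    /* ns = cyc2ns_offset + (cyc * 1024 >> 10) == cyc2ns_offset + cyc */

The new per-cpu cyc2ns_offset term is what allows the scale to be rewritten (e.g. across a cpufreq transition) without the returned clock value jumping.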
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index b685ece89d5c..d2c6c930b491 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h | |||
@@ -25,7 +25,7 @@ | |||
25 | #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) | 25 | #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) |
26 | 26 | ||
27 | #define KERNEL_DS MAKE_MM_SEG(-1UL) | 27 | #define KERNEL_DS MAKE_MM_SEG(-1UL) |
28 | #define USER_DS MAKE_MM_SEG(PAGE_OFFSET) | 28 | #define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) |
29 | 29 | ||
30 | #define get_ds() (KERNEL_DS) | 30 | #define get_ds() (KERNEL_DS) |
31 | #define get_fs() (current_thread_info()->addr_limit) | 31 | #define get_fs() (current_thread_info()->addr_limit) |
@@ -212,9 +212,9 @@ extern int __get_user_bad(void); | |||
212 | : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") | 212 | : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") |
213 | #else | 213 | #else |
214 | #define __put_user_asm_u64(x, ptr, retval, errret) \ | 214 | #define __put_user_asm_u64(x, ptr, retval, errret) \ |
215 | __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) | 215 | __put_user_asm(x, ptr, retval, "q", "", "er", errret) |
216 | #define __put_user_asm_ex_u64(x, addr) \ | 216 | #define __put_user_asm_ex_u64(x, addr) \ |
217 | __put_user_asm_ex(x, addr, "q", "", "Zr") | 217 | __put_user_asm_ex(x, addr, "q", "", "er") |
218 | #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) | 218 | #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) |
219 | #endif | 219 | #endif |
220 | 220 | ||
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8cc687326eb8..db24b215fc50 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h | |||
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) | |||
88 | ret, "l", "k", "ir", 4); | 88 | ret, "l", "k", "ir", 4); |
89 | return ret; | 89 | return ret; |
90 | case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, | 90 | case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, |
91 | ret, "q", "", "ir", 8); | 91 | ret, "q", "", "er", 8); |
92 | return ret; | 92 | return ret; |
93 | case 10: | 93 | case 10: |
94 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, | 94 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, |
95 | ret, "q", "", "ir", 10); | 95 | ret, "q", "", "er", 10); |
96 | if (unlikely(ret)) | 96 | if (unlikely(ret)) |
97 | return ret; | 97 | return ret; |
98 | asm("":::"memory"); | 98 | asm("":::"memory"); |
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) | |||
101 | return ret; | 101 | return ret; |
102 | case 16: | 102 | case 16: |
103 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, | 103 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, |
104 | ret, "q", "", "ir", 16); | 104 | ret, "q", "", "er", 16); |
105 | if (unlikely(ret)) | 105 | if (unlikely(ret)) |
106 | return ret; | 106 | return ret; |
107 | asm("":::"memory"); | 107 | asm("":::"memory"); |
108 | __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, | 108 | __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, |
109 | ret, "q", "", "ir", 8); | 109 | ret, "q", "", "er", 8); |
110 | return ret; | 110 | return ret; |
111 | default: | 111 | default: |
112 | return copy_user_generic((__force void *)dst, src, size); | 112 | return copy_user_generic((__force void *)dst, src, size); |
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) | |||
157 | ret, "q", "", "=r", 8); | 157 | ret, "q", "", "=r", 8); |
158 | if (likely(!ret)) | 158 | if (likely(!ret)) |
159 | __put_user_asm(tmp, (u64 __user *)dst, | 159 | __put_user_asm(tmp, (u64 __user *)dst, |
160 | ret, "q", "", "ir", 8); | 160 | ret, "q", "", "er", 8); |
161 | return ret; | 161 | return ret; |
162 | } | 162 | } |
163 | default: | 163 | default: |
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 732a30706153..8deaada61bc8 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h | |||
@@ -345,6 +345,8 @@ | |||
345 | 345 | ||
346 | #ifdef __KERNEL__ | 346 | #ifdef __KERNEL__ |
347 | 347 | ||
348 | #define NR_syscalls 337 | ||
349 | |||
348 | #define __ARCH_WANT_IPC_PARSE_VERSION | 350 | #define __ARCH_WANT_IPC_PARSE_VERSION |
349 | #define __ARCH_WANT_OLD_READDIR | 351 | #define __ARCH_WANT_OLD_READDIR |
350 | #define __ARCH_WANT_OLD_STAT | 352 | #define __ARCH_WANT_OLD_STAT |
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 900e1617e672..b9f3c60de5f7 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h | |||
@@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) | |||
688 | #endif /* __NO_STUBS */ | 688 | #endif /* __NO_STUBS */ |
689 | 689 | ||
690 | #ifdef __KERNEL__ | 690 | #ifdef __KERNEL__ |
691 | |||
692 | #ifndef COMPILE_OFFSETS | ||
693 | #include <asm/asm-offsets.h> | ||
694 | #define NR_syscalls (__NR_syscall_max + 1) | ||
695 | #endif | ||
696 | |||
691 | /* | 697 | /* |
692 | * "Conditional" syscalls | 698 | * "Conditional" syscalls |
693 | * | 699 | * |
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index bddd44f2f0ab..80e2984f521c 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h | |||
@@ -133,7 +133,7 @@ struct bau_msg_payload { | |||
133 | * see table 4.2.3.0.1 in broadcast_assist spec. | 133 | * see table 4.2.3.0.1 in broadcast_assist spec. |
134 | */ | 134 | */ |
135 | struct bau_msg_header { | 135 | struct bau_msg_header { |
136 | unsigned int dest_subnodeid:6; /* must be zero */ | 136 | unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ |
137 | /* bits 5:0 */ | 137 | /* bits 5:0 */ |
138 | unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ | 138 | unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ |
139 | /* bits 20:6 */ /* first bit in node_map */ | 139 | /* bits 20:6 */ /* first bit in node_map */ |
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 341070f7ad5c..77a68505419a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h | |||
@@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); | |||
175 | #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) | 175 | #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) |
176 | 176 | ||
177 | #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ | 177 | #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ |
178 | ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) | 178 | (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) |
179 | 179 | ||
180 | #define UV_APIC_PNODE_SHIFT 6 | 180 | #define UV_APIC_PNODE_SHIFT 6 |
181 | 181 | ||
@@ -327,6 +327,7 @@ struct uv_blade_info { | |||
327 | unsigned short nr_possible_cpus; | 327 | unsigned short nr_possible_cpus; |
328 | unsigned short nr_online_cpus; | 328 | unsigned short nr_online_cpus; |
329 | unsigned short pnode; | 329 | unsigned short pnode; |
330 | short memory_nid; | ||
330 | }; | 331 | }; |
331 | extern struct uv_blade_info *uv_blade_info; | 332 | extern struct uv_blade_info *uv_blade_info; |
332 | extern short *uv_node_to_blade; | 333 | extern short *uv_node_to_blade; |
@@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid) | |||
363 | return uv_blade_info[bid].pnode; | 364 | return uv_blade_info[bid].pnode; |
364 | } | 365 | } |
365 | 366 | ||
367 | /* Nid of memory node on blade. -1 if no blade-local memory */ | ||
368 | static inline int uv_blade_to_memory_nid(int bid) | ||
369 | { | ||
370 | return uv_blade_info[bid].memory_nid; | ||
371 | } | ||
372 | |||
366 | /* Determine the number of possible cpus on a blade */ | 373 | /* Determine the number of possible cpus on a blade */ |
367 | static inline int uv_blade_nr_possible_cpus(int bid) | 374 | static inline int uv_blade_nr_possible_cpus(int bid) |
368 | { | 375 | { |
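Usage sketch for the new uv_blade_to_memory_nid() accessor (caller assumed, not in this patch): since -1 means the blade has no local memory, it can be fed straight to an allocator that falls back to the current node for negative nids:

    /* alloc_pages_node() treats a negative nid as "current node" */
    struct page *p = alloc_pages_node(uv_blade_to_memory_nid(bid),
                                      GFP_KERNEL, 0);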
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b67efd1cf59b..bf04201b6575 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -24,6 +24,10 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) | |||
24 | CFLAGS_hpet.o := $(nostackp) | 24 | CFLAGS_hpet.o := $(nostackp) |
25 | CFLAGS_tsc.o := $(nostackp) | 25 | CFLAGS_tsc.o := $(nostackp) |
26 | CFLAGS_paravirt.o := $(nostackp) | 26 | CFLAGS_paravirt.o := $(nostackp) |
27 | GCOV_PROFILE_vsyscall_64.o := n | ||
28 | GCOV_PROFILE_hpet.o := n | ||
29 | GCOV_PROFILE_tsc.o := n | ||
30 | GCOV_PROFILE_paravirt.o := n | ||
27 | 31 | ||
28 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o | 32 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o |
29 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o | 33 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 631086159c53..6b8ca3a0285d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c | |||
@@ -44,11 +44,7 @@ | |||
44 | 44 | ||
45 | static int __initdata acpi_force = 0; | 45 | static int __initdata acpi_force = 0; |
46 | u32 acpi_rsdt_forced; | 46 | u32 acpi_rsdt_forced; |
47 | #ifdef CONFIG_ACPI | 47 | int acpi_disabled; |
48 | int acpi_disabled = 0; | ||
49 | #else | ||
50 | int acpi_disabled = 1; | ||
51 | #endif | ||
52 | EXPORT_SYMBOL(acpi_disabled); | 48 | EXPORT_SYMBOL(acpi_disabled); |
53 | 49 | ||
54 | #ifdef CONFIG_X86_64 | 50 | #ifdef CONFIG_X86_64 |
@@ -122,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size) | |||
122 | early_iounmap(map, size); | 118 | early_iounmap(map, size); |
123 | } | 119 | } |
124 | 120 | ||
125 | #ifdef CONFIG_PCI_MMCONFIG | ||
126 | |||
127 | static int acpi_mcfg_64bit_base_addr __initdata = FALSE; | ||
128 | |||
129 | /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ | ||
130 | struct acpi_mcfg_allocation *pci_mmcfg_config; | ||
131 | int pci_mmcfg_config_num; | ||
132 | |||
133 | static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) | ||
134 | { | ||
135 | if (!strcmp(mcfg->header.oem_id, "SGI")) | ||
136 | acpi_mcfg_64bit_base_addr = TRUE; | ||
137 | |||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | int __init acpi_parse_mcfg(struct acpi_table_header *header) | ||
142 | { | ||
143 | struct acpi_table_mcfg *mcfg; | ||
144 | unsigned long i; | ||
145 | int config_size; | ||
146 | |||
147 | if (!header) | ||
148 | return -EINVAL; | ||
149 | |||
150 | mcfg = (struct acpi_table_mcfg *)header; | ||
151 | |||
152 | /* how many config structures do we have */ | ||
153 | pci_mmcfg_config_num = 0; | ||
154 | i = header->length - sizeof(struct acpi_table_mcfg); | ||
155 | while (i >= sizeof(struct acpi_mcfg_allocation)) { | ||
156 | ++pci_mmcfg_config_num; | ||
157 | i -= sizeof(struct acpi_mcfg_allocation); | ||
158 | }; | ||
159 | if (pci_mmcfg_config_num == 0) { | ||
160 | printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); | ||
161 | return -ENODEV; | ||
162 | } | ||
163 | |||
164 | config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); | ||
165 | pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); | ||
166 | if (!pci_mmcfg_config) { | ||
167 | printk(KERN_WARNING PREFIX | ||
168 | "No memory for MCFG config tables\n"); | ||
169 | return -ENOMEM; | ||
170 | } | ||
171 | |||
172 | memcpy(pci_mmcfg_config, &mcfg[1], config_size); | ||
173 | |||
174 | acpi_mcfg_oem_check(mcfg); | ||
175 | |||
176 | for (i = 0; i < pci_mmcfg_config_num; ++i) { | ||
177 | if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && | ||
178 | !acpi_mcfg_64bit_base_addr) { | ||
179 | printk(KERN_ERR PREFIX | ||
180 | "MMCONFIG not in low 4GB of memory\n"); | ||
181 | kfree(pci_mmcfg_config); | ||
182 | pci_mmcfg_config_num = 0; | ||
183 | return -ENODEV; | ||
184 | } | ||
185 | } | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | #endif /* CONFIG_PCI_MMCONFIG */ | ||
190 | |||
191 | #ifdef CONFIG_X86_LOCAL_APIC | 121 | #ifdef CONFIG_X86_LOCAL_APIC |
192 | static int __init acpi_parse_madt(struct acpi_table_header *table) | 122 | static int __init acpi_parse_madt(struct acpi_table_header *table) |
193 | { | 123 | { |
@@ -1519,14 +1449,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { | |||
1519 | }, | 1449 | }, |
1520 | { | 1450 | { |
1521 | .callback = force_acpi_ht, | 1451 | .callback = force_acpi_ht, |
1522 | .ident = "ASUS P4B266", | ||
1523 | .matches = { | ||
1524 | DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), | ||
1525 | DMI_MATCH(DMI_BOARD_NAME, "P4B266"), | ||
1526 | }, | ||
1527 | }, | ||
1528 | { | ||
1529 | .callback = force_acpi_ht, | ||
1530 | .ident = "ASUS P2B-DS", | 1452 | .ident = "ASUS P2B-DS", |
1531 | .matches = { | 1453 | .matches = { |
1532 | DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), | 1454 | DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), |
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bbbe4bbb6f34..8c44c232efcb 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c | |||
@@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, | |||
34 | flags->bm_check = 1; | 34 | flags->bm_check = 1; |
35 | else if (c->x86_vendor == X86_VENDOR_INTEL) { | 35 | else if (c->x86_vendor == X86_VENDOR_INTEL) { |
36 | /* | 36 | /* |
37 | * Today all CPUs that support C3 share cache. | 37 | * Today all MP CPUs that support C3 share cache. |
38 | * TBD: This needs to look at cache shared map, once | 38 | * And caches should not be flushed by software while |
39 | * multi-core detection patch makes to the base. | 39 | * entering C3 type state. |
40 | */ | 40 | */ |
41 | flags->bm_check = 1; | 41 | flags->bm_check = 1; |
42 | } | 42 | } |
43 | |||
44 | /* | ||
45 | * On all recent Intel platforms, ARB_DISABLE is a nop. | ||
46 | * So, set bm_control to zero to indicate that ARB_DISABLE | ||
47 | * is not required while entering a C3-type state on ||
48 | * P4, Core and later CPUs. ||
49 | */ | ||
50 | if (c->x86_vendor == X86_VENDOR_INTEL && | ||
51 | (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) | ||
52 | flags->bm_control = 0; | ||
43 | } | 53 | } |
44 | EXPORT_SYMBOL(acpi_processor_power_init_bm_check); | 54 | EXPORT_SYMBOL(acpi_processor_power_init_bm_check); |
45 | 55 | ||
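Decoding the family/model check added above (the model mapping is assumed from Intel's numbering, not stated in the patch):

    /* c->x86 > 0x6                 -> e.g. family 0xf: Pentium 4/Xeon */
    /* c->x86 == 6, x86_model >= 14 -> Core (Yonah) and later          */
    /* family 6, model 13 (Pentium M) keeps bm_control / ARB_DISABLE   */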
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 7c074eec39fb..d296f4a195c9 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c | |||
@@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) | |||
72 | return; | 72 | return; |
73 | } | 73 | } |
74 | 74 | ||
75 | |||
75 | /* Initialize _PDC data based on the CPU vendor */ | 76 | /* Initialize _PDC data based on the CPU vendor */ |
76 | void arch_acpi_processor_init_pdc(struct acpi_processor *pr) | 77 | void arch_acpi_processor_init_pdc(struct acpi_processor *pr) |
77 | { | 78 | { |
@@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) | |||
85 | } | 86 | } |
86 | 87 | ||
87 | EXPORT_SYMBOL(arch_acpi_processor_init_pdc); | 88 | EXPORT_SYMBOL(arch_acpi_processor_init_pdc); |
89 | |||
90 | void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) | ||
91 | { | ||
92 | if (pr->pdc) { | ||
93 | kfree(pr->pdc->pointer->buffer.pointer); | ||
94 | kfree(pr->pdc->pointer); | ||
95 | kfree(pr->pdc); | ||
96 | pr->pdc = NULL; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc); | ||
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 167bc16ce0e5..6a564ac67ef5 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile | |||
@@ -42,6 +42,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \ | |||
42 | $(call cc-option, -mpreferred-stack-boundary=2) | 42 | $(call cc-option, -mpreferred-stack-boundary=2) |
43 | KBUILD_CFLAGS += $(call cc-option, -m32) | 43 | KBUILD_CFLAGS += $(call cc-option, -m32) |
44 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ | 44 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ |
45 | GCOV_PROFILE := n | ||
45 | 46 | ||
46 | WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) | 47 | WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) |
47 | 48 | ||
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 1c60554537c3..6c99f5037801 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -434,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) | |||
434 | iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); | 434 | iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); |
435 | } | 435 | } |
436 | 436 | ||
437 | /* Flush the whole IO/TLB for a given protection domain - including PDE */ | ||
438 | static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) | ||
439 | { | ||
440 | u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; | ||
441 | |||
442 | INC_STATS_COUNTER(domain_flush_single); | ||
443 | |||
444 | iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); | ||
445 | } | ||
446 | |||
437 | /* | 447 | /* |
438 | * This function is used to flush the IO/TLB for a given protection domain | 448 | * This function is used to flush the IO/TLB for a given protection domain |
439 | * on every IOMMU in the system | 449 | * on every IOMMU in the system |
@@ -1078,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu, | |||
1078 | amd_iommu_pd_table[devid] = domain; | 1088 | amd_iommu_pd_table[devid] = domain; |
1079 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | 1089 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); |
1080 | 1090 | ||
1091 | /* | ||
1092 | * We might boot into a crash-kernel here. The crashed kernel | ||
1093 | * left the caches in the IOMMU dirty. So we have to flush | ||
1094 | * here to evict all dirty stuff. | ||
1095 | */ | ||
1081 | iommu_queue_inv_dev_entry(iommu, devid); | 1096 | iommu_queue_inv_dev_entry(iommu, devid); |
1097 | iommu_flush_tlb_pde(iommu, domain->id); | ||
1082 | } | 1098 | } |
1083 | 1099 | ||
1084 | /* | 1100 | /* |
@@ -1176,7 +1192,7 @@ out: | |||
1176 | return 0; | 1192 | return 0; |
1177 | } | 1193 | } |
1178 | 1194 | ||
1179 | struct notifier_block device_nb = { | 1195 | static struct notifier_block device_nb = { |
1180 | .notifier_call = device_change_notifier, | 1196 | .notifier_call = device_change_notifier, |
1181 | }; | 1197 | }; |
1182 | 1198 | ||
@@ -1747,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size, | |||
1747 | flag |= __GFP_ZERO; | 1763 | flag |= __GFP_ZERO; |
1748 | virt_addr = (void *)__get_free_pages(flag, get_order(size)); | 1764 | virt_addr = (void *)__get_free_pages(flag, get_order(size)); |
1749 | if (!virt_addr) | 1765 | if (!virt_addr) |
1750 | return 0; | 1766 | return NULL; |
1751 | 1767 | ||
1752 | paddr = virt_to_phys(virt_addr); | 1768 | paddr = virt_to_phys(virt_addr); |
1753 | 1769 | ||
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 238989ec077d..c1b17e97252e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c | |||
@@ -260,6 +260,14 @@ static void iommu_enable(struct amd_iommu *iommu) | |||
260 | 260 | ||
261 | static void iommu_disable(struct amd_iommu *iommu) | 261 | static void iommu_disable(struct amd_iommu *iommu) |
262 | { | 262 | { |
263 | /* Disable command buffer */ | ||
264 | iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); | ||
265 | |||
266 | /* Disable event logging and event interrupts */ | ||
267 | iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); | ||
268 | iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); | ||
269 | |||
270 | /* Disable IOMMU hardware itself */ | ||
263 | iommu_feature_disable(iommu, CONTROL_IOMMU_EN); | 271 | iommu_feature_disable(iommu, CONTROL_IOMMU_EN); |
264 | } | 272 | } |
265 | 273 | ||
@@ -464,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) | |||
464 | if (iommu->evt_buf == NULL) | 472 | if (iommu->evt_buf == NULL) |
465 | return NULL; | 473 | return NULL; |
466 | 474 | ||
475 | iommu->evt_buf_size = EVT_BUFFER_SIZE; | ||
476 | |||
467 | return iommu->evt_buf; | 477 | return iommu->evt_buf; |
468 | } | 478 | } |
469 | 479 | ||
@@ -478,6 +488,10 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu) | |||
478 | memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, | 488 | memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, |
479 | &entry, sizeof(entry)); | 489 | &entry, sizeof(entry)); |
480 | 490 | ||
491 | /* set head and tail to zero manually */ | ||
492 | writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); | ||
493 | writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); | ||
494 | |||
481 | iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); | 495 | iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); |
482 | } | 496 | } |
483 | 497 | ||
@@ -679,6 +693,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, | |||
679 | 693 | ||
680 | devid = e->devid; | 694 | devid = e->devid; |
681 | devid_to = e->ext >> 8; | 695 | devid_to = e->ext >> 8; |
696 | set_dev_entry_from_acpi(iommu, devid, e->flags, 0); ||
682 | set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); | 697 | set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); |
683 | amd_iommu_alias_table[devid] = devid_to; | 698 | amd_iommu_alias_table[devid] = devid_to; |
684 | break; | 699 | break; |
@@ -737,11 +752,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, | |||
737 | 752 | ||
738 | devid = e->devid; | 753 | devid = e->devid; |
739 | for (dev_i = devid_start; dev_i <= devid; ++dev_i) { | 754 | for (dev_i = devid_start; dev_i <= devid; ++dev_i) { |
740 | if (alias) | 755 | if (alias) { |
741 | amd_iommu_alias_table[dev_i] = devid_to; | 756 | amd_iommu_alias_table[dev_i] = devid_to; |
742 | set_dev_entry_from_acpi(iommu, | 757 | set_dev_entry_from_acpi(iommu, |
743 | amd_iommu_alias_table[dev_i], | 758 | devid_to, flags, ext_flags); |
744 | flags, ext_flags); | 759 | } |
760 | set_dev_entry_from_acpi(iommu, dev_i, | ||
761 | flags, ext_flags); | ||
745 | } | 762 | } |
746 | break; | 763 | break; |
747 | default: | 764 | default: |
@@ -1042,6 +1059,7 @@ static void enable_iommus(void) | |||
1042 | struct amd_iommu *iommu; | 1059 | struct amd_iommu *iommu; |
1043 | 1060 | ||
1044 | for_each_iommu(iommu) { | 1061 | for_each_iommu(iommu) { |
1062 | iommu_disable(iommu); | ||
1045 | iommu_set_device_table(iommu); | 1063 | iommu_set_device_table(iommu); |
1046 | iommu_enable_command_buffer(iommu); | 1064 | iommu_enable_command_buffer(iommu); |
1047 | iommu_enable_event_buffer(iommu); | 1065 | iommu_enable_event_buffer(iommu); |
@@ -1066,12 +1084,6 @@ static void disable_iommus(void) | |||
1066 | 1084 | ||
1067 | static int amd_iommu_resume(struct sys_device *dev) | 1085 | static int amd_iommu_resume(struct sys_device *dev) |
1068 | { | 1086 | { |
1069 | /* | ||
1070 | * Disable IOMMUs before reprogramming the hardware registers. | ||
1071 | * IOMMU is still enabled from the resume kernel. | ||
1072 | */ | ||
1073 | disable_iommus(); | ||
1074 | |||
1075 | /* re-load the hardware */ | 1087 | /* re-load the hardware */ |
1076 | enable_iommus(); | 1088 | enable_iommus(); |
1077 | 1089 | ||
@@ -1079,8 +1091,8 @@ static int amd_iommu_resume(struct sys_device *dev) | |||
1079 | * we have to flush after the IOMMUs are enabled because a | 1091 | * we have to flush after the IOMMUs are enabled because a |
1080 | * disabled IOMMU will never execute the commands we send | 1092 | * disabled IOMMU will never execute the commands we send |
1081 | */ | 1093 | */ |
1082 | amd_iommu_flush_all_domains(); | ||
1083 | amd_iommu_flush_all_devices(); | 1094 | amd_iommu_flush_all_devices(); |
1095 | amd_iommu_flush_all_domains(); | ||
1084 | 1096 | ||
1085 | return 0; | 1097 | return 0; |
1086 | } | 1098 | } |
@@ -1273,6 +1285,11 @@ free: | |||
1273 | goto out; | 1285 | goto out; |
1274 | } | 1286 | } |
1275 | 1287 | ||
1288 | void amd_iommu_shutdown(void) | ||
1289 | { | ||
1290 | disable_iommus(); | ||
1291 | } | ||
1292 | |||
1276 | /**************************************************************************** | 1293 | /**************************************************************************** |
1277 | * | 1294 | * |
1278 | * Early detect code. This code runs at IOMMU detection time in the DMA | 1295 | * Early detect code. This code runs at IOMMU detection time in the DMA |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8c7c042ecad1..0a1c2830ec66 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -140,7 +140,6 @@ int x2apic_mode; | |||
140 | #ifdef CONFIG_X86_X2APIC | 140 | #ifdef CONFIG_X86_X2APIC |
141 | /* x2apic enabled before OS handover */ | 141 | /* x2apic enabled before OS handover */ |
142 | static int x2apic_preenabled; | 142 | static int x2apic_preenabled; |
143 | static int disable_x2apic; | ||
144 | static __init int setup_nox2apic(char *str) | 143 | static __init int setup_nox2apic(char *str) |
145 | { | 144 | { |
146 | if (x2apic_enabled()) { | 145 | if (x2apic_enabled()) { |
@@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str) | |||
149 | return 0; | 148 | return 0; |
150 | } | 149 | } |
151 | 150 | ||
152 | disable_x2apic = 1; | ||
153 | setup_clear_cpu_cap(X86_FEATURE_X2APIC); | 151 | setup_clear_cpu_cap(X86_FEATURE_X2APIC); |
154 | return 0; | 152 | return 0; |
155 | } | 153 | } |
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 69328ac8de9c..8952a5890281 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c | |||
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, | |||
652 | return ret && es7000_apic_is_cluster(); | 652 | return ret && es7000_apic_is_cluster(); |
653 | } | 653 | } |
654 | 654 | ||
655 | struct apic apic_es7000_cluster = { | 655 | /* We've been warned by a false-positive warning. Use __refdata to keep it calm. */ |
656 | struct apic __refdata apic_es7000_cluster = { | ||
656 | 657 | ||
657 | .name = "es7000", | 658 | .name = "es7000", |
658 | .probe = probe_es7000, | 659 | .probe = probe_es7000, |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ef8d9290c7ea..d2ed6c5ddc80 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -462,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) | |||
462 | static void | 462 | static void |
463 | __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | 463 | __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) |
464 | { | 464 | { |
465 | union entry_union eu; | 465 | union entry_union eu = {{0, 0}}; |
466 | |||
466 | eu.entry = e; | 467 | eu.entry = e; |
467 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | 468 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); |
468 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | 469 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); |
@@ -1413,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq, | |||
1413 | irte.vector = vector; | 1414 | irte.vector = vector; |
1414 | irte.dest_id = IRTE_DEST(destination); | 1415 | irte.dest_id = IRTE_DEST(destination); |
1415 | 1416 | ||
1417 | /* Set source-id of interrupt request */ | ||
1418 | set_ioapic_sid(&irte, apic_id); | ||
1419 | |||
1416 | modify_irte(irq, &irte); | 1420 | modify_irte(irq, &irte); |
1417 | 1421 | ||
1418 | ir_entry->index2 = (index >> 15) & 0x1; | 1422 | ir_entry->index2 = (index >> 15) & 0x1; |
@@ -1712,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void) | |||
1712 | return; | 1716 | return; |
1713 | } | 1717 | } |
1714 | 1718 | ||
1715 | __apicdebuginit(void) print_APIC_bitfield(int base) | 1719 | __apicdebuginit(void) print_APIC_field(int base) |
1716 | { | 1720 | { |
1717 | unsigned int v; | 1721 | int i; |
1718 | int i, j; | ||
1719 | 1722 | ||
1720 | if (apic_verbosity == APIC_QUIET) | 1723 | if (apic_verbosity == APIC_QUIET) |
1721 | return; | 1724 | return; |
1722 | 1725 | ||
1723 | printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | 1726 | printk(KERN_DEBUG); |
1724 | for (i = 0; i < 8; i++) { | 1727 | |
1725 | v = apic_read(base + i*0x10); | 1728 | for (i = 0; i < 8; i++) |
1726 | for (j = 0; j < 32; j++) { | 1729 | printk(KERN_CONT "%08x", apic_read(base + i*0x10)); |
1727 | if (v & (1<<j)) | 1730 | |
1728 | printk("1"); | 1731 | printk(KERN_CONT "\n"); |
1729 | else | ||
1730 | printk("0"); | ||
1731 | } | ||
1732 | printk("\n"); | ||
1733 | } | ||
1734 | } | 1732 | } |
1735 | 1733 | ||
1736 | __apicdebuginit(void) print_local_APIC(void *dummy) | 1734 | __apicdebuginit(void) print_local_APIC(void *dummy) |
@@ -1741,7 +1739,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) | |||
1741 | if (apic_verbosity == APIC_QUIET) | 1739 | if (apic_verbosity == APIC_QUIET) |
1742 | return; | 1740 | return; |
1743 | 1741 | ||
1744 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | 1742 | printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", |
1745 | smp_processor_id(), hard_smp_processor_id()); | 1743 | smp_processor_id(), hard_smp_processor_id()); |
1746 | v = apic_read(APIC_ID); | 1744 | v = apic_read(APIC_ID); |
1747 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); | 1745 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); |
@@ -1782,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy) | |||
1782 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | 1780 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); |
1783 | 1781 | ||
1784 | printk(KERN_DEBUG "... APIC ISR field:\n"); | 1782 | printk(KERN_DEBUG "... APIC ISR field:\n"); |
1785 | print_APIC_bitfield(APIC_ISR); | 1783 | print_APIC_field(APIC_ISR); |
1786 | printk(KERN_DEBUG "... APIC TMR field:\n"); | 1784 | printk(KERN_DEBUG "... APIC TMR field:\n"); |
1787 | print_APIC_bitfield(APIC_TMR); | 1785 | print_APIC_field(APIC_TMR); |
1788 | printk(KERN_DEBUG "... APIC IRR field:\n"); | 1786 | printk(KERN_DEBUG "... APIC IRR field:\n"); |
1789 | print_APIC_bitfield(APIC_IRR); | 1787 | print_APIC_field(APIC_IRR); |
1790 | 1788 | ||
1791 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | 1789 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ |
1792 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | 1790 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
@@ -2003,7 +2001,9 @@ void disable_IO_APIC(void) | |||
2003 | /* | 2001 | /* |
2004 | * Use virtual wire A mode when interrupt remapping is enabled. | 2002 | * Use virtual wire A mode when interrupt remapping is enabled. |
2005 | */ | 2003 | */ |
2006 | disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); | 2004 | if (cpu_has_apic) |
2005 | disconnect_bsp_APIC(!intr_remapping_enabled && | ||
2006 | ioapic_i8259.pin != -1); | ||
2007 | } | 2007 | } |
2008 | 2008 | ||
2009 | #ifdef CONFIG_X86_32 | 2009 | #ifdef CONFIG_X86_32 |
@@ -3287,6 +3287,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms | |||
3287 | irte.vector = cfg->vector; | 3287 | irte.vector = cfg->vector; |
3288 | irte.dest_id = IRTE_DEST(dest); | 3288 | irte.dest_id = IRTE_DEST(dest); |
3289 | 3289 | ||
3290 | /* Set source-id of interrupt request */ | ||
3291 | set_msi_sid(&irte, pdev); | ||
3292 | |||
3290 | modify_irte(irq, &irte); | 3293 | modify_irte(irq, &irte); |
3291 | 3294 | ||
3292 | msg->address_hi = MSI_ADDR_BASE_HI; | 3295 | msg->address_hi = MSI_ADDR_BASE_HI; |
@@ -3567,7 +3570,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) | |||
3567 | 3570 | ||
3568 | #endif /* CONFIG_SMP */ | 3571 | #endif /* CONFIG_SMP */ |
3569 | 3572 | ||
3570 | struct irq_chip dmar_msi_type = { | 3573 | static struct irq_chip dmar_msi_type = { |
3571 | .name = "DMAR_MSI", | 3574 | .name = "DMAR_MSI", |
3572 | .unmask = dmar_msi_unmask, | 3575 | .unmask = dmar_msi_unmask, |
3573 | .mask = dmar_msi_mask, | 3576 | .mask = dmar_msi_mask, |
@@ -3790,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, | |||
3790 | mmr_pnode = uv_blade_to_pnode(mmr_blade); | 3793 | mmr_pnode = uv_blade_to_pnode(mmr_blade); |
3791 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); | 3794 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); |
3792 | 3795 | ||
3796 | if (cfg->move_in_progress) | ||
3797 | send_cleanup_vector(cfg); | ||
3798 | |||
3793 | return irq; | 3799 | return irq; |
3794 | } | 3800 | } |
3795 | 3801 | ||
@@ -4178,28 +4184,20 @@ fake_ioapic_page: | |||
4178 | } | 4184 | } |
4179 | } | 4185 | } |
4180 | 4186 | ||
4181 | static int __init ioapic_insert_resources(void) | 4187 | void __init ioapic_insert_resources(void) |
4182 | { | 4188 | { |
4183 | int i; | 4189 | int i; |
4184 | struct resource *r = ioapic_resources; | 4190 | struct resource *r = ioapic_resources; |
4185 | 4191 | ||
4186 | if (!r) { | 4192 | if (!r) { |
4187 | if (nr_ioapics > 0) { | 4193 | if (nr_ioapics > 0) |
4188 | printk(KERN_ERR | 4194 | printk(KERN_ERR |
4189 | "IO APIC resources couldn't be allocated.\n"); | 4195 | "IO APIC resources couldn't be allocated.\n"); |
4190 | return -1; | 4196 | return; |
4191 | } | ||
4192 | return 0; | ||
4193 | } | 4197 | } |
4194 | 4198 | ||
4195 | for (i = 0; i < nr_ioapics; i++) { | 4199 | for (i = 0; i < nr_ioapics; i++) { |
4196 | insert_resource(&iomem_resource, r); | 4200 | insert_resource(&iomem_resource, r); |
4197 | r++; | 4201 | r++; |
4198 | } | 4202 | } |
4199 | |||
4200 | return 0; | ||
4201 | } | 4203 | } |
4202 | |||
4203 | /* Insert the IO APIC resources after PCI initialization has occured to handle | ||
4204 | * IO APICS that are mapped in on a BAR in PCI space. */ | ||
4205 | late_initcall(ioapic_insert_resources); | ||
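With the late_initcall() dropped and the return type made void, some explicit caller after PCI enumeration now has to run ioapic_insert_resources() instead, so that IO-APICs mapped through PCI BARs are covered. The actual call site is outside this hunk; the following is only a hypothetical sketch of such a caller, and the pcibios_resource_survey() name is an assumption, not taken from this diff:

    /* Hypothetical caller: runs once PCI BARs are known. */
    void __init pcibios_resource_survey(void)
    {
            /* ... existing PCI resource bookkeeping ... */
            ioapic_insert_resources();
    }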
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a9..6ef00ba4c886 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c | |||
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) | |||
106 | unsigned long mask = cpumask_bits(cpumask)[0]; | 106 | unsigned long mask = cpumask_bits(cpumask)[0]; |
107 | unsigned long flags; | 107 | unsigned long flags; |
108 | 108 | ||
109 | if (WARN_ONCE(!mask, "empty IPI mask")) | ||
110 | return; | ||
111 | |||
109 | local_irq_save(flags); | 112 | local_irq_save(flags); |
110 | WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); | 113 | WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); |
111 | __default_send_IPI_dest_field(mask, vector, apic->dest_logical); | 114 | __default_send_IPI_dest_field(mask, vector, apic->dest_logical); |
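The new guard works because WARN_ONCE() evaluates to its condition, so a single statement both rate-limits the diagnostic and bails out. A minimal kernel-side sketch of the pattern, assuming only <linux/kernel.h>:

    #include <linux/kernel.h>

    static void send_ipi_checked(unsigned long mask, int vector)
    {
            /* WARN_ONCE() returns true exactly when the condition fired:
             * warn on the first empty mask, skip the send every time. */
            if (WARN_ONCE(!mask, "empty IPI mask"))
                    return;

            /* ... program the ICR with 'mask' and 'vector' as usual ... */
    }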
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 533e59c6fc82..ca96e68f0d23 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c | |||
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void) | |||
493 | (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); | 493 | (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); |
494 | } | 494 | } |
495 | 495 | ||
496 | struct apic apic_numaq = { | 496 | /* Use __refdata to quiet a false-positive section mismatch warning. */
497 | struct apic __refdata apic_numaq = { | ||
497 | 498 | ||
498 | .name = "NUMAQ", | 499 | .name = "NUMAQ", |
499 | .probe = probe_numaq, | 500 | .probe = probe_numaq, |
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 440a8bccd91a..0c0182cc947d 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c | |||
@@ -20,23 +20,12 @@ | |||
20 | #include <asm/apic.h> | 20 | #include <asm/apic.h> |
21 | #include <asm/setup.h> | 21 | #include <asm/setup.h> |
22 | 22 | ||
23 | #include <linux/threads.h> | ||
24 | #include <linux/cpumask.h> | ||
25 | #include <asm/mpspec.h> | ||
26 | #include <asm/fixmap.h> | ||
27 | #include <asm/apicdef.h> | ||
28 | #include <linux/kernel.h> | ||
29 | #include <linux/string.h> | ||
30 | #include <linux/smp.h> | 23 | #include <linux/smp.h> |
31 | #include <linux/init.h> | ||
32 | #include <asm/ipi.h> | 24 | #include <asm/ipi.h> |
33 | 25 | ||
34 | #include <linux/smp.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
37 | #include <asm/acpi.h> | 27 | #include <asm/acpi.h> |
38 | #include <asm/e820.h> | 28 | #include <asm/e820.h> |
39 | #include <asm/setup.h> | ||
40 | 29 | ||
41 | #ifdef CONFIG_HOTPLUG_CPU | 30 | #ifdef CONFIG_HOTPLUG_CPU |
42 | #define DEFAULT_SEND_IPI (1) | 31 | #define DEFAULT_SEND_IPI (1) |
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b82..fcec2f1d34a1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c | |||
@@ -44,6 +44,11 @@ static struct apic *apic_probe[] __initdata = { | |||
44 | NULL, | 44 | NULL, |
45 | }; | 45 | }; |
46 | 46 | ||
47 | static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) | ||
48 | { | ||
49 | return hard_smp_processor_id() >> index_msb; | ||
50 | } | ||
51 | |||
47 | /* | 52 | /* |
48 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | 53 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. |
49 | */ | 54 | */ |
@@ -69,6 +74,11 @@ void __init default_setup_apic_routing(void) | |||
69 | printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); | 74 | printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); |
70 | } | 75 | } |
71 | 76 | ||
77 | if (is_vsmp_box()) { | ||
78 | /* need to update phys_pkg_id */ | ||
79 | apic->phys_pkg_id = apicid_phys_pkg_id; | ||
80 | } | ||
81 | |||
72 | /* | 82 | /* |
73 | * Now that apic routing model is selected, configure the | 83 | * Now that apic routing model is selected, configure the |
74 | * fault handling for intr remapping. | 84 | * fault handling for intr remapping. |
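On vSMP the package ID is derived from the live hardware APIC ID (hard_smp_processor_id()) rather than from cached cpuinfo, and index_msb gives the width of the sub-package ID field. A worked example with assumed numbers:

    #include <stdio.h>

    int main(void)
    {
            int index_msb = 4;    /* assumed: 16 APIC IDs per package   */
            int hard_id   = 0x13; /* assumed hard_smp_processor_id()    */

            printf("package %d\n", hard_id >> index_msb);  /* -> 1 */
            return 0;
    }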
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 344eee4ac0a4..eafdfbd1ea95 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c | |||
@@ -44,7 +44,6 @@ | |||
44 | #include <asm/ipi.h> | 44 | #include <asm/ipi.h> |
45 | #include <linux/kernel.h> | 45 | #include <linux/kernel.h> |
46 | #include <linux/string.h> | 46 | #include <linux/string.h> |
47 | #include <linux/init.h> | ||
48 | #include <linux/gfp.h> | 47 | #include <linux/gfp.h> |
49 | #include <linux/smp.h> | 48 | #include <linux/smp.h> |
50 | 49 | ||
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8e4cbb255c38..a5371ec36776 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c | |||
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
17 | return x2apic_enabled(); | 17 | return x2apic_enabled(); |
18 | } | 18 | } |
19 | 19 | ||
20 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | 20 | /* |
21 | 21 | * need to use more than cpu 0, because we need more vectors when | |
22 | * MSI-X is used. | ||
23 | */ | ||
22 | static const struct cpumask *x2apic_target_cpus(void) | 24 | static const struct cpumask *x2apic_target_cpus(void) |
23 | { | 25 | { |
24 | return cpumask_of(0); | 26 | return cpu_online_mask; |
25 | } | 27 | } |
26 | 28 | ||
27 | /* | 29 | /* |
@@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id) | |||
170 | 172 | ||
171 | static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) | 173 | static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) |
172 | { | 174 | { |
173 | return current_cpu_data.initial_apicid >> index_msb; | 175 | return initial_apicid >> index_msb; |
174 | } | 176 | } |
175 | 177 | ||
176 | static void x2apic_send_IPI_self(int vector) | 178 | static void x2apic_send_IPI_self(int vector) |
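Two independent fixes in this file: target_cpus now offers every online CPU, so MSI-X-heavy devices are no longer limited to CPU 0's vector budget, and phys_pkg_id derives the package from the initial_apicid argument it is handed, since current_cpu_data describes whichever CPU happens to execute the call, not necessarily the CPU being queried. A small worked example with assumed values:

    #include <stdio.h>

    int main(void)
    {
            int index_msb      = 3;    /* assumed: 8 APIC IDs per package */
            int initial_apicid = 0x1c; /* assumed ID of the queried CPU   */

            /* The new code shifts the argument; the old code shifted the
             * caller's own APIC ID, giving wrong answers cross-CPU. */
            printf("package %d\n", initial_apicid >> index_msb); /* -> 3 */
            return 0;
    }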
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a284359627e7..a8989aadc99a 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c | |||
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
27 | return 0; | 27 | return 0; |
28 | } | 28 | } |
29 | 29 | ||
30 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | 30 | /* |
31 | 31 | * need to use more than cpu 0, because we need more vectors when | |
32 | * MSI-X is used. | ||
33 | */ | ||
32 | static const struct cpumask *x2apic_target_cpus(void) | 34 | static const struct cpumask *x2apic_target_cpus(void) |
33 | { | 35 | { |
34 | return cpumask_of(0); | 36 | return cpu_online_mask; |
35 | } | 37 | } |
36 | 38 | ||
37 | static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) | 39 | static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) |
@@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id) | |||
162 | 164 | ||
163 | static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) | 165 | static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) |
164 | { | 166 | { |
165 | return current_cpu_data.initial_apicid >> index_msb; | 167 | return initial_apicid >> index_msb; |
166 | } | 168 | } |
167 | 169 | ||
168 | static void x2apic_send_IPI_self(int vector) | 170 | static void x2apic_send_IPI_self(int vector) |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 096d19aea2f7..601159374e87 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -46,7 +46,7 @@ static int early_get_nodeid(void) | |||
46 | return node_id.s.node_id; | 46 | return node_id.s.node_id; |
47 | } | 47 | } |
48 | 48 | ||
49 | static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 49 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
50 | { | 50 | { |
51 | if (!strcmp(oem_id, "SGI")) { | 51 | if (!strcmp(oem_id, "SGI")) { |
52 | if (!strcmp(oem_table_id, "UVL")) | 52 | if (!strcmp(oem_table_id, "UVL")) |
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector) | |||
253 | apic_write(APIC_SELF_IPI, vector); | 253 | apic_write(APIC_SELF_IPI, vector); |
254 | } | 254 | } |
255 | 255 | ||
256 | struct apic apic_x2apic_uv_x = { | 256 | struct apic __refdata apic_x2apic_uv_x = { |
257 | 257 | ||
258 | .name = "UV large system", | 258 | .name = "UV large system", |
259 | .probe = NULL, | 259 | .probe = NULL, |
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = { | |||
261 | .apic_id_registered = uv_apic_id_registered, | 261 | .apic_id_registered = uv_apic_id_registered, |
262 | 262 | ||
263 | .irq_delivery_mode = dest_Fixed, | 263 | .irq_delivery_mode = dest_Fixed, |
264 | .irq_dest_mode = 1, /* logical */ | 264 | .irq_dest_mode = 0, /* physical */ |
265 | 265 | ||
266 | .target_cpus = uv_target_cpus, | 266 | .target_cpus = uv_target_cpus, |
267 | .disable_esr = 0, | 267 | .disable_esr = 0, |
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) | |||
362 | BUG(); | 362 | BUG(); |
363 | } | 363 | } |
364 | 364 | ||
365 | static __init void map_low_mmrs(void) | ||
366 | { | ||
367 | init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); | ||
368 | init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); | ||
369 | } | ||
370 | |||
371 | enum map_type {map_wb, map_uc}; | 365 | enum map_type {map_wb, map_uc}; |
372 | 366 | ||
373 | static __init void map_high(char *id, unsigned long base, int shift, | 367 | static __init void map_high(char *id, unsigned long base, int shift, |
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode) | |||
395 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); | 389 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); |
396 | } | 390 | } |
397 | 391 | ||
398 | static __init void map_config_high(int max_pnode) | ||
399 | { | ||
400 | union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; | ||
401 | int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
402 | |||
403 | cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); | ||
404 | if (cfg.s.enable) | ||
405 | map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); | ||
406 | } | ||
407 | |||
408 | static __init void map_mmr_high(int max_pnode) | ||
409 | { | ||
410 | union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; | ||
411 | int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
412 | |||
413 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); | ||
414 | if (mmr.s.enable) | ||
415 | map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); | ||
416 | } | ||
417 | |||
418 | static __init void map_mmioh_high(int max_pnode) | 392 | static __init void map_mmioh_high(int max_pnode) |
419 | { | 393 | { |
420 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; | 394 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; |
@@ -566,8 +540,6 @@ void __init uv_system_init(void) | |||
566 | unsigned long mmr_base, present, paddr; | 540 | unsigned long mmr_base, present, paddr; |
567 | unsigned short pnode_mask; | 541 | unsigned short pnode_mask; |
568 | 542 | ||
569 | map_low_mmrs(); | ||
570 | |||
571 | m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); | 543 | m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); |
572 | m_val = m_n_config.s.m_skt; | 544 | m_val = m_n_config.s.m_skt; |
573 | n_val = m_n_config.s.n_skt; | 545 | n_val = m_n_config.s.n_skt; |
@@ -591,6 +563,8 @@ void __init uv_system_init(void) | |||
591 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); | 563 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); |
592 | uv_blade_info = kmalloc(bytes, GFP_KERNEL); | 564 | uv_blade_info = kmalloc(bytes, GFP_KERNEL); |
593 | BUG_ON(!uv_blade_info); | 565 | BUG_ON(!uv_blade_info); |
566 | for (blade = 0; blade < uv_num_possible_blades(); blade++) | ||
567 | uv_blade_info[blade].memory_nid = -1; | ||
594 | 568 | ||
595 | get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); | 569 | get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); |
596 | 570 | ||
@@ -629,6 +603,9 @@ void __init uv_system_init(void) | |||
629 | lcpu = uv_blade_info[blade].nr_possible_cpus; | 603 | lcpu = uv_blade_info[blade].nr_possible_cpus; |
630 | uv_blade_info[blade].nr_possible_cpus++; | 604 | uv_blade_info[blade].nr_possible_cpus++; |
631 | 605 | ||
606 | /* Remember any node on the blade; stays -1 if the blade has none. */ | ||
607 | uv_blade_info[blade].memory_nid = nid; | ||
608 | |||
632 | uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; | 609 | uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; |
633 | uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; | 610 | uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; |
634 | uv_cpu_hub_info(cpu)->m_val = m_val; | 611 | uv_cpu_hub_info(cpu)->m_val = m_val; |
@@ -662,11 +639,10 @@ void __init uv_system_init(void) | |||
662 | pnode = (paddr >> m_val) & pnode_mask; | 639 | pnode = (paddr >> m_val) & pnode_mask; |
663 | blade = boot_pnode_to_blade(pnode); | 640 | blade = boot_pnode_to_blade(pnode); |
664 | uv_node_to_blade[nid] = blade; | 641 | uv_node_to_blade[nid] = blade; |
642 | max_pnode = max(pnode, max_pnode); | ||
665 | } | 643 | } |
666 | 644 | ||
667 | map_gru_high(max_pnode); | 645 | map_gru_high(max_pnode); |
668 | map_mmr_high(max_pnode); | ||
669 | map_config_high(max_pnode); | ||
670 | map_mmioh_high(max_pnode); | 646 | map_mmioh_high(max_pnode); |
671 | 647 | ||
672 | uv_cpu_init(); | 648 | uv_cpu_init(); |

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9a33a4..442b5508893f 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -811,7 +811,7 @@ static int apm_do_idle(void) | |||
811 | u8 ret = 0; | 811 | u8 ret = 0; |
812 | int idled = 0; | 812 | int idled = 0; |
813 | int polling; | 813 | int polling; |
814 | int err; | 814 | int err = 0; |
815 | 815 | ||
816 | polling = !!(current_thread_info()->status & TS_POLLING); | 816 | polling = !!(current_thread_info()->status & TS_POLLING); |
817 | if (polling) { | 817 | if (polling) { |
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 898ecc47e129..4a6aeedcd965 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -3,6 +3,7 @@ | |||
3 | * This code generates raw asm output which is post-processed to extract | 3 | * This code generates raw asm output which is post-processed to extract |
4 | * and format the required data. | 4 | * and format the required data. |
5 | */ | 5 | */ |
6 | #define COMPILE_OFFSETS | ||
6 | 7 | ||
7 | #include <linux/crypto.h> | 8 | #include <linux/crypto.h> |
8 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3efcb2b96a15..c1f253dac155 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER | |||
7 | CFLAGS_REMOVE_common.o = -pg | 7 | CFLAGS_REMOVE_common.o = -pg |
8 | endif | 8 | endif |
9 | 9 | ||
10 | # Make sure load_percpu_segment has no stackprotector | ||
11 | nostackp := $(call cc-option, -fno-stack-protector) | ||
12 | CFLAGS_common.o := $(nostackp) | ||
13 | |||
10 | obj-y := intel_cacheinfo.o addon_cpuid_features.o | 14 | obj-y := intel_cacheinfo.o addon_cpuid_features.o |
11 | obj-y += proc.o capflags.o powerflags.o common.o | 15 | obj-y += proc.o capflags.o powerflags.o common.o |
12 | obj-y += vmware.o hypervisor.o | 16 | obj-y += vmware.o hypervisor.o |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8f1b47..63fddcd082cd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | |||
258 | { | 258 | { |
259 | #ifdef CONFIG_X86_HT | 259 | #ifdef CONFIG_X86_HT |
260 | unsigned bits; | 260 | unsigned bits; |
261 | int cpu = smp_processor_id(); | ||
261 | 262 | ||
262 | bits = c->x86_coreid_bits; | 263 | bits = c->x86_coreid_bits; |
263 | |||
264 | /* Low order bits define the core id (index of core in socket) */ | 264 | /* Low order bits define the core id (index of core in socket) */ |
265 | c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); | 265 | c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); |
266 | /* Convert the initial APIC ID into the socket ID */ | 266 | /* Convert the initial APIC ID into the socket ID */ |
267 | c->phys_proc_id = c->initial_apicid >> bits; | 267 | c->phys_proc_id = c->initial_apicid >> bits; |
268 | /* use socket ID also for last level cache */ | ||
269 | per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; | ||
268 | #endif | 270 | #endif |
269 | } | 271 | } |
270 | 272 | ||
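The core/socket split is a pure bitfield decode of the initial APIC ID, and the added line reuses the socket ID as the last-level-cache ID. A worked example with assumed values (x86_coreid_bits == 2, i.e. four cores per socket):

    #include <stdio.h>

    int main(void)
    {
            unsigned bits   = 2;   /* assumed x86_coreid_bits        */
            unsigned apicid = 0x7; /* assumed initial APIC ID        */

            printf("core %u, socket %u\n",
                   apicid & ((1u << bits) - 1), /* 0x7 & 0x3 -> core 3   */
                   apicid >> bits);             /* 0x7 >> 2  -> socket 1 */
            /* per_cpu(cpu_llc_id, cpu) = socket: cores 4..7 share an LLC. */
            return 0;
    }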
@@ -354,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | |||
354 | #endif | 356 | #endif |
355 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) | 357 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) |
356 | /* check CPU config space for extended APIC ID */ | 358 | /* check CPU config space for extended APIC ID */ |
357 | if (c->x86 >= 0xf) { | 359 | if (cpu_has_apic && c->x86 >= 0xf) { |
358 | unsigned int val; | 360 | unsigned int val; |
359 | val = read_pci_config(0, 24, 0, 0x68); | 361 | val = read_pci_config(0, 24, 0, 0x68); |
360 | if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) | 362 | if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) |
@@ -398,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
398 | level = cpuid_eax(1); | 400 | level = cpuid_eax(1); |
399 | if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) | 401 | if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) |
400 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 402 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
403 | |||
404 | /* | ||
405 | * Some BIOSes incorrectly force this feature, but only K8 | ||
406 | * revision D (model = 0x14) and later actually support it. | ||
407 | */ | ||
408 | if (c->x86_model < 0x14) | ||
409 | clear_cpu_cap(c, X86_FEATURE_LAHF_LM); | ||
401 | } | 410 | } |
402 | if (c->x86 == 0x10 || c->x86 == 0x11) | 411 | if (c->x86 == 0x10 || c->x86 == 0x11) |
403 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 412 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
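LAHF_LM is CPUID 0x80000001 ECX bit 0; some BIOSes force it on even though only K8 revision D (model 0x14) and later actually implement LAHF/SAHF in long mode, so the kernel distrusts the bit on earlier models. A userspace-style sketch of the same decision (values assumed):

    #include <stdint.h>

    /* Trust the BIOS-visible LAHF_LM bit only on K8 rev D or later. */
    static int lahf_lm_usable(uint32_t ecx_80000001, unsigned int x86_model)
    {
            int reported = ecx_80000001 & 1;  /* CPUID 0x80000001, ECX[0] */

            return reported && x86_model >= 0x14;  /* 0x14 == revision D */
    }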
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9fa33886c0d7..5ce60a88027b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) | |||
59 | alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); | 59 | alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); |
60 | } | 60 | } |
61 | 61 | ||
62 | static const struct cpu_dev *this_cpu __cpuinitdata; | 62 | static void __cpuinit default_init(struct cpuinfo_x86 *c) |
63 | { | ||
64 | #ifdef CONFIG_X86_64 | ||
65 | display_cacheinfo(c); | ||
66 | #else | ||
67 | /* Not much we can do here... */ | ||
68 | /* Check if at least it has cpuid */ | ||
69 | if (c->cpuid_level == -1) { | ||
70 | /* No cpuid. It must be an ancient CPU */ | ||
71 | if (c->x86 == 4) | ||
72 | strcpy(c->x86_model_id, "486"); | ||
73 | else if (c->x86 == 3) | ||
74 | strcpy(c->x86_model_id, "386"); | ||
75 | } | ||
76 | #endif | ||
77 | } | ||
78 | |||
79 | static const struct cpu_dev __cpuinitconst default_cpu = { | ||
80 | .c_init = default_init, | ||
81 | .c_vendor = "Unknown", | ||
82 | .c_x86_vendor = X86_VENDOR_UNKNOWN, | ||
83 | }; | ||
84 | |||
85 | static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; | ||
63 | 86 | ||
64 | DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { | 87 | DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { |
65 | #ifdef CONFIG_X86_64 | 88 | #ifdef CONFIG_X86_64 |
@@ -108,7 +131,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { | |||
108 | /* data */ | 131 | /* data */ |
109 | [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, | 132 | [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, |
110 | 133 | ||
111 | [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, | 134 | [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, |
112 | [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, | 135 | [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, |
113 | GDT_STACK_CANARY_INIT | 136 | GDT_STACK_CANARY_INIT |
114 | #endif | 137 | #endif |
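The ESPFIX_SS fix is easiest to see by decoding the descriptor words: the old pair 0x00000000/0x00c09200 has a zero limit field, describing a 4 KiB segment, while the new 0x0000ffff/0x00cf9200 matches GDT_ENTRY_PERCPU, a flat 4 GiB writable data segment. A standalone decode sketch (userspace, not kernel code):

    #include <stdio.h>

    /* Decode the two 32-bit words of an x86 segment descriptor. */
    static void decode_desc(unsigned int a, unsigned int b)
    {
            unsigned int base  = (a >> 16) | ((b & 0xffu) << 16)
                                 | (b & 0xff000000u);
            unsigned int limit = (a & 0xffffu) | (b & 0x000f0000u);
            int g = (b >> 23) & 1;        /* granularity: 4 KiB units */
            unsigned long long bytes =
                    ((unsigned long long)limit + 1) << (g ? 12 : 0);

            printf("base=%#x limit=%#x G=%d -> %llu bytes\n",
                   base, limit, g, bytes);
    }

    int main(void)
    {
            decode_desc(0x00000000, 0x00c09200); /* old ESPFIX_SS: 4 KiB */
            decode_desc(0x0000ffff, 0x00cf9200); /* new ESPFIX_SS: 4 GiB */
            return 0;
    }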
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) | |||
332 | 355 | ||
333 | static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; | 356 | static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; |
334 | 357 | ||
335 | static void __cpuinit default_init(struct cpuinfo_x86 *c) | ||
336 | { | ||
337 | #ifdef CONFIG_X86_64 | ||
338 | display_cacheinfo(c); | ||
339 | #else | ||
340 | /* Not much we can do here... */ | ||
341 | /* Check if at least it has cpuid */ | ||
342 | if (c->cpuid_level == -1) { | ||
343 | /* No cpuid. It must be an ancient CPU */ | ||
344 | if (c->x86 == 4) | ||
345 | strcpy(c->x86_model_id, "486"); | ||
346 | else if (c->x86 == 3) | ||
347 | strcpy(c->x86_model_id, "386"); | ||
348 | } | ||
349 | #endif | ||
350 | } | ||
351 | |||
352 | static const struct cpu_dev __cpuinitconst default_cpu = { | ||
353 | .c_init = default_init, | ||
354 | .c_vendor = "Unknown", | ||
355 | .c_x86_vendor = X86_VENDOR_UNKNOWN, | ||
356 | }; | ||
357 | |||
358 | static void __cpuinit get_model_name(struct cpuinfo_x86 *c) | 358 | static void __cpuinit get_model_name(struct cpuinfo_x86 *c) |
359 | { | 359 | { |
360 | unsigned int *v; | 360 | unsigned int *v; |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index cf52215d9eb1..2a50ef891000 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -1,3 +1,4 @@ | |||
1 | |||
1 | /* | 2 | /* |
2 | * (c) 2003-2006 Advanced Micro Devices, Inc. | 3 | * (c) 2003-2006 Advanced Micro Devices, Inc. |
3 | * Your use of this code is subject to the terms and conditions of the | 4 | * Your use of this code is subject to the terms and conditions of the |
@@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) | |||
117 | u32 i = 0; | 118 | u32 i = 0; |
118 | 119 | ||
119 | if (cpu_family == CPU_HW_PSTATE) { | 120 | if (cpu_family == CPU_HW_PSTATE) { |
120 | if (data->currpstate == HW_PSTATE_INVALID) { | 121 | rdmsr(MSR_PSTATE_STATUS, lo, hi); |
121 | /* read (initial) hw pstate if not yet set */ | 122 | i = lo & HW_PSTATE_MASK; |
122 | rdmsr(MSR_PSTATE_STATUS, lo, hi); | 123 | data->currpstate = i; |
123 | i = lo & HW_PSTATE_MASK; | 124 | |
124 | 125 | /* | |
125 | /* | 126 | * a workaround for family 11h erratum 311 might cause |
126 | * a workaround for family 11h erratum 311 might cause | 127 | * an "out-of-range" Pstate if the core is in Pstate-0
127 | * an "out-of-range" Pstate if the core is in Pstate-0 | 128 | */
128 | */ | 129 | if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) |
129 | if (i >= data->numps) | 130 | data->currpstate = HW_PSTATE_0; |
130 | data->currpstate = HW_PSTATE_0; | 131 | |
131 | else | ||
132 | data->currpstate = i; | ||
133 | } | ||
134 | return 0; | 132 | return 0; |
135 | } | 133 | } |
136 | do { | 134 | do { |
@@ -301,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate) | |||
301 | static int transition_fid_vid(struct powernow_k8_data *data, | 299 | static int transition_fid_vid(struct powernow_k8_data *data, |
302 | u32 reqfid, u32 reqvid) | 300 | u32 reqfid, u32 reqvid) |
303 | { | 301 | { |
304 | if (core_voltage_pre_transition(data, reqvid)) | 302 | if (core_voltage_pre_transition(data, reqvid, reqfid)) |
305 | return 1; | 303 | return 1; |
306 | 304 | ||
307 | if (core_frequency_transition(data, reqfid)) | 305 | if (core_frequency_transition(data, reqfid)) |
@@ -329,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data, | |||
329 | 327 | ||
330 | /* Phase 1 - core voltage transition ... setup voltage */ | 328 | /* Phase 1 - core voltage transition ... setup voltage */ |
331 | static int core_voltage_pre_transition(struct powernow_k8_data *data, | 329 | static int core_voltage_pre_transition(struct powernow_k8_data *data, |
332 | u32 reqvid) | 330 | u32 reqvid, u32 reqfid) |
333 | { | 331 | { |
334 | u32 rvosteps = data->rvo; | 332 | u32 rvosteps = data->rvo; |
335 | u32 savefid = data->currfid; | 333 | u32 savefid = data->currfid; |
336 | u32 maxvid, lo; | 334 | u32 maxvid, lo, rvomult = 1; |
337 | 335 | ||
338 | dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " | 336 | dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " |
339 | "reqvid 0x%x, rvo 0x%x\n", | 337 | "reqvid 0x%x, rvo 0x%x\n", |
340 | smp_processor_id(), | 338 | smp_processor_id(), |
341 | data->currfid, data->currvid, reqvid, data->rvo); | 339 | data->currfid, data->currvid, reqvid, data->rvo); |
342 | 340 | ||
341 | if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) | ||
342 | rvomult = 2; | ||
343 | rvosteps *= rvomult; | ||
343 | rdmsr(MSR_FIDVID_STATUS, lo, maxvid); | 344 | rdmsr(MSR_FIDVID_STATUS, lo, maxvid); |
344 | maxvid = 0x1f & (maxvid >> 16); | 345 | maxvid = 0x1f & (maxvid >> 16); |
345 | dprintk("ph1 maxvid=0x%x\n", maxvid); | 346 | dprintk("ph1 maxvid=0x%x\n", maxvid); |
@@ -353,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, | |||
353 | return 1; | 354 | return 1; |
354 | } | 355 | } |
355 | 356 | ||
356 | while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { | 357 | while ((rvosteps > 0) && |
358 | ((rvomult * data->rvo + data->currvid) > reqvid)) { | ||
357 | if (data->currvid == maxvid) { | 359 | if (data->currvid == maxvid) { |
358 | rvosteps = 0; | 360 | rvosteps = 0; |
359 | } else { | 361 | } else { |
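In the low fid range the ramp voltage offset is applied twice, so both the step budget and the loop's termination test scale by rvomult. A worked example with assumed values:

    /* Assumed: data->rvo == 1, and both currfid and reqfid lie below
     * LO_FID_TABLE_TOP, so rvomult == 2:
     *
     *   rvosteps = 1 * 2 = 2                 (step budget doubled)
     *   loop while (2 * 1 + currvid) > reqvid (offset doubled in test)
     *
     * i.e. stepping stops rvomult * rvo VID codes short of reqvid
     * instead of rvo codes, matching the doubled offset. */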
@@ -386,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) | |||
386 | u32 vcoreqfid, vcocurrfid, vcofiddiff; | 388 | u32 vcoreqfid, vcocurrfid, vcofiddiff; |
387 | u32 fid_interval, savevid = data->currvid; | 389 | u32 fid_interval, savevid = data->currvid; |
388 | 390 | ||
389 | if ((reqfid < HI_FID_TABLE_BOTTOM) && | ||
390 | (data->currfid < HI_FID_TABLE_BOTTOM)) { | ||
391 | printk(KERN_ERR PFX "ph2: illegal lo-lo transition " | ||
392 | "0x%x 0x%x\n", reqfid, data->currfid); | ||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | if (data->currfid == reqfid) { | 391 | if (data->currfid == reqfid) { |
397 | printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", | 392 | printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", |
398 | data->currfid); | 393 | data->currfid); |
@@ -409,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) | |||
409 | vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid | 404 | vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid |
410 | : vcoreqfid - vcocurrfid; | 405 | : vcoreqfid - vcocurrfid; |
411 | 406 | ||
407 | if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) | ||
408 | vcofiddiff = 0; | ||
409 | |||
412 | while (vcofiddiff > 2) { | 410 | while (vcofiddiff > 2) { |
413 | (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); | 411 | (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); |
414 | 412 | ||
@@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, | |||
510 | return 0; | 508 | return 0; |
511 | } | 509 | } |
512 | 510 | ||
513 | static int check_supported_cpu(unsigned int cpu) | 511 | static void check_supported_cpu(void *_rc) |
514 | { | 512 | { |
515 | cpumask_t oldmask; | ||
516 | u32 eax, ebx, ecx, edx; | 513 | u32 eax, ebx, ecx, edx; |
517 | unsigned int rc = 0; | 514 | int *rc = _rc; |
518 | 515 | ||
519 | oldmask = current->cpus_allowed; | 516 | *rc = -ENODEV; |
520 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
521 | |||
522 | if (smp_processor_id() != cpu) { | ||
523 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); | ||
524 | goto out; | ||
525 | } | ||
526 | 517 | ||
527 | if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) | 518 | if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) |
528 | goto out; | 519 | return; |
529 | 520 | ||
530 | eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); | 521 | eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); |
531 | if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && | 522 | if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && |
532 | ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) | 523 | ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) |
533 | goto out; | 524 | return; |
534 | 525 | ||
535 | if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { | 526 | if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { |
536 | if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || | 527 | if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || |
537 | ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { | 528 | ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { |
538 | printk(KERN_INFO PFX | 529 | printk(KERN_INFO PFX |
539 | "Processor cpuid %x not supported\n", eax); | 530 | "Processor cpuid %x not supported\n", eax); |
540 | goto out; | 531 | return; |
541 | } | 532 | } |
542 | 533 | ||
543 | eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); | 534 | eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); |
544 | if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { | 535 | if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { |
545 | printk(KERN_INFO PFX | 536 | printk(KERN_INFO PFX |
546 | "No frequency change capabilities detected\n"); | 537 | "No frequency change capabilities detected\n"); |
547 | goto out; | 538 | return; |
548 | } | 539 | } |
549 | 540 | ||
550 | cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); | 541 | cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); |
@@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu) | |||
552 | != P_STATE_TRANSITION_CAPABLE) { | 543 | != P_STATE_TRANSITION_CAPABLE) { |
553 | printk(KERN_INFO PFX | 544 | printk(KERN_INFO PFX |
554 | "Power state transitions not supported\n"); | 545 | "Power state transitions not supported\n"); |
555 | goto out; | 546 | return; |
556 | } | 547 | } |
557 | } else { /* must be a HW Pstate capable processor */ | 548 | } else { /* must be a HW Pstate capable processor */ |
558 | cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); | 549 | cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); |
559 | if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) | 550 | if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) |
560 | cpu_family = CPU_HW_PSTATE; | 551 | cpu_family = CPU_HW_PSTATE; |
561 | else | 552 | else |
562 | goto out; | 553 | return; |
563 | } | 554 | } |
564 | 555 | ||
565 | rc = 1; | 556 | *rc = 0; |
566 | |||
567 | out: | ||
568 | set_cpus_allowed_ptr(current, &oldmask); | ||
569 | return rc; | ||
570 | } | 557 | } |
571 | 558 | ||
572 | static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, | 559 | static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, |
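The conversion replaces the fragile "rebind current with set_cpus_allowed_ptr() and hope we landed on the right CPU" idiom with a direct cross-CPU call; results come back through the void * argument. A minimal kernel-side sketch of the pattern (names are illustrative, not from this driver):

    #include <linux/errno.h>
    #include <linux/smp.h>

    /* Runs on the target CPU; reports back through the argument. */
    static void probe_on_cpu(void *_rc)
    {
            int *rc = _rc;

            *rc = -ENODEV;
            /* ... inspect CPUID/MSRs here, set *rc = 0 on success ... */
    }

    static int probe_cpu(unsigned int cpu)
    {
            int rc;

            /* wait == 1: block until probe_on_cpu() has run on 'cpu' */
            smp_call_function_single(cpu, probe_on_cpu, &rc, 1);
            return rc;
    }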
@@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, | |||
823 | if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) | 810 | if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) |
824 | return; | 811 | return; |
825 | 812 | ||
826 | control = data->acpi_data.states[index].control; data->irt = (control | 813 | control = data->acpi_data.states[index].control; |
827 | >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> | 814 | data->irt = (control >> IRT_SHIFT) & IRT_MASK; |
828 | RVO_SHIFT) & RVO_MASK; data->exttype = (control | 815 | data->rvo = (control >> RVO_SHIFT) & RVO_MASK; |
829 | >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; | 816 | data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; |
830 | data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 | 817 | data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; |
831 | << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = | 818 | data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); |
832 | (control >> VST_SHIFT) & VST_MASK; } | 819 | data->vstable = (control >> VST_SHIFT) & VST_MASK; |
820 | } | ||
833 | 821 | ||
834 | static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) | 822 | static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) |
835 | { | 823 | { |
@@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data) | |||
1046 | if (cur_latency > max_latency) | 1034 | if (cur_latency > max_latency) |
1047 | max_latency = cur_latency; | 1035 | max_latency = cur_latency; |
1048 | } | 1036 | } |
1037 | if (max_latency == 0) { | ||
1038 | /* | ||
1039 | * Fam 11h always returns 0 as transition latency. | ||
1040 | * This is intended and means "very fast". While the cpufreq core | ||
1041 | * and governors currently handle that gracefully, set it to 1 | ||
1042 | * anyway to avoid problems in the future. | ||
1043 | * For all others it's a BIOS bug. | ||
1044 | */ | ||
1045 | if (boot_cpu_data.x86 != 0x11) | ||
1046 | printk(KERN_ERR FW_WARN PFX "Invalid zero transition " | ||
1047 | "latency\n"); | ||
1048 | max_latency = 1; | ||
1049 | } | ||
1049 | /* value in usecs, needs to be in nanoseconds */ | 1050 | /* value in usecs, needs to be in nanoseconds */ |
1050 | return 1000 * max_latency; | 1051 | return 1000 * max_latency; |
1051 | } | 1052 | } |
@@ -1080,20 +1081,12 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, | |||
1080 | return 0; | 1081 | return 0; |
1081 | } | 1082 | } |
1082 | 1083 | ||
1083 | if ((fid < HI_FID_TABLE_BOTTOM) && | ||
1084 | (data->currfid < HI_FID_TABLE_BOTTOM)) { | ||
1085 | printk(KERN_ERR PFX | ||
1086 | "ignoring illegal change in lo freq table-%x to 0x%x\n", | ||
1087 | data->currfid, fid); | ||
1088 | return 1; | ||
1089 | } | ||
1090 | |||
1091 | dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", | 1084 | dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", |
1092 | smp_processor_id(), fid, vid); | 1085 | smp_processor_id(), fid, vid); |
1093 | freqs.old = find_khz_freq_from_fid(data->currfid); | 1086 | freqs.old = find_khz_freq_from_fid(data->currfid); |
1094 | freqs.new = find_khz_freq_from_fid(fid); | 1087 | freqs.new = find_khz_freq_from_fid(fid); |
1095 | 1088 | ||
1096 | for_each_cpu_mask_nr(i, *(data->available_cores)) { | 1089 | for_each_cpu(i, data->available_cores) { |
1097 | freqs.cpu = i; | 1090 | freqs.cpu = i; |
1098 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 1091 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
1099 | } | 1092 | } |
@@ -1101,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, | |||
1101 | res = transition_fid_vid(data, fid, vid); | 1094 | res = transition_fid_vid(data, fid, vid); |
1102 | freqs.new = find_khz_freq_from_fid(data->currfid); | 1095 | freqs.new = find_khz_freq_from_fid(data->currfid); |
1103 | 1096 | ||
1104 | for_each_cpu_mask_nr(i, *(data->available_cores)) { | 1097 | for_each_cpu(i, data->available_cores) { |
1105 | freqs.cpu = i; | 1098 | freqs.cpu = i; |
1106 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 1099 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
1107 | } | 1100 | } |
@@ -1126,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, | |||
1126 | data->currpstate); | 1119 | data->currpstate); |
1127 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); | 1120 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); |
1128 | 1121 | ||
1129 | for_each_cpu_mask_nr(i, *(data->available_cores)) { | 1122 | for_each_cpu(i, data->available_cores) { |
1130 | freqs.cpu = i; | 1123 | freqs.cpu = i; |
1131 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 1124 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
1132 | } | 1125 | } |
@@ -1134,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, | |||
1134 | res = transition_pstate(data, pstate); | 1127 | res = transition_pstate(data, pstate); |
1135 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); | 1128 | freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); |
1136 | 1129 | ||
1137 | for_each_cpu_mask_nr(i, *(data->available_cores)) { | 1130 | for_each_cpu(i, data->available_cores) { |
1138 | freqs.cpu = i; | 1131 | freqs.cpu = i; |
1139 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 1132 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
1140 | } | 1133 | } |
@@ -1235,21 +1228,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol) | |||
1235 | return cpufreq_frequency_table_verify(pol, data->powernow_table); | 1228 | return cpufreq_frequency_table_verify(pol, data->powernow_table); |
1236 | } | 1229 | } |
1237 | 1230 | ||
1238 | static const char ACPI_PSS_BIOS_BUG_MSG[] = | 1231 | struct init_on_cpu { |
1239 | KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" | 1232 | struct powernow_k8_data *data; |
1240 | KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; | 1233 | int rc; |
1234 | }; | ||
1235 | |||
1236 | static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) | ||
1237 | { | ||
1238 | struct init_on_cpu *init_on_cpu = _init_on_cpu; | ||
1239 | |||
1240 | if (pending_bit_stuck()) { | ||
1241 | printk(KERN_ERR PFX "failing init, change pending bit set\n"); | ||
1242 | init_on_cpu->rc = -ENODEV; | ||
1243 | return; | ||
1244 | } | ||
1245 | |||
1246 | if (query_current_values_with_pending_wait(init_on_cpu->data)) { | ||
1247 | init_on_cpu->rc = -ENODEV; | ||
1248 | return; | ||
1249 | } | ||
1250 | |||
1251 | if (cpu_family == CPU_OPTERON) | ||
1252 | fidvid_msr_init(); | ||
1253 | |||
1254 | init_on_cpu->rc = 0; | ||
1255 | } | ||
1241 | 1256 | ||
1242 | /* per CPU init entry point to the driver */ | 1257 | /* per CPU init entry point to the driver */ |
1243 | static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | 1258 | static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) |
1244 | { | 1259 | { |
1260 | static const char ACPI_PSS_BIOS_BUG_MSG[] = | ||
1261 | KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" | ||
1262 | FW_BUG PFX "Try again with latest BIOS.\n"; | ||
1245 | struct powernow_k8_data *data; | 1263 | struct powernow_k8_data *data; |
1246 | cpumask_t oldmask; | 1264 | struct init_on_cpu init_on_cpu; |
1247 | int rc; | 1265 | int rc; |
1248 | 1266 | ||
1249 | if (!cpu_online(pol->cpu)) | 1267 | if (!cpu_online(pol->cpu)) |
1250 | return -ENODEV; | 1268 | return -ENODEV; |
1251 | 1269 | ||
1252 | if (!check_supported_cpu(pol->cpu)) | 1270 | smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); |
1271 | if (rc) | ||
1253 | return -ENODEV; | 1272 | return -ENODEV; |
1254 | 1273 | ||
1255 | data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); | 1274 | data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); |
@@ -1289,27 +1308,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1289 | pol->cpuinfo.transition_latency = get_transition_latency(data); | 1308 | pol->cpuinfo.transition_latency = get_transition_latency(data); |
1290 | 1309 | ||
1291 | /* only run on specific CPU from here on */ | 1310 | /* only run on specific CPU from here on */ |
1292 | oldmask = current->cpus_allowed; | 1311 | init_on_cpu.data = data; |
1293 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); | 1312 | smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, |
1294 | 1313 | &init_on_cpu, 1); | |
1295 | if (smp_processor_id() != pol->cpu) { | 1314 | rc = init_on_cpu.rc; |
1296 | printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); | 1315 | if (rc != 0) |
1297 | goto err_out_unmask; | 1316 | goto err_out_exit_acpi; |
1298 | } | ||
1299 | |||
1300 | if (pending_bit_stuck()) { | ||
1301 | printk(KERN_ERR PFX "failing init, change pending bit set\n"); | ||
1302 | goto err_out_unmask; | ||
1303 | } | ||
1304 | |||
1305 | if (query_current_values_with_pending_wait(data)) | ||
1306 | goto err_out_unmask; | ||
1307 | |||
1308 | if (cpu_family == CPU_OPTERON) | ||
1309 | fidvid_msr_init(); | ||
1310 | |||
1311 | /* run on any CPU again */ | ||
1312 | set_cpus_allowed_ptr(current, &oldmask); | ||
1313 | 1317 | ||
1314 | if (cpu_family == CPU_HW_PSTATE) | 1318 | if (cpu_family == CPU_HW_PSTATE) |
1315 | cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); | 1319 | cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); |
@@ -1346,8 +1350,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) | |||
1346 | 1350 | ||
1347 | return 0; | 1351 | return 0; |
1348 | 1352 | ||
1349 | err_out_unmask: | 1353 | err_out_exit_acpi: |
1350 | set_cpus_allowed_ptr(current, &oldmask); | ||
1351 | powernow_k8_cpu_exit_acpi(data); | 1354 | powernow_k8_cpu_exit_acpi(data); |
1352 | 1355 | ||
1353 | err_out: | 1356 | err_out: |
@@ -1372,28 +1375,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) | |||
1372 | return 0; | 1375 | return 0; |
1373 | } | 1376 | } |
1374 | 1377 | ||
1378 | static void query_values_on_cpu(void *_err) | ||
1379 | { | ||
1380 | int *err = _err; | ||
1381 | struct powernow_k8_data *data = __get_cpu_var(powernow_data); | ||
1382 | |||
1383 | *err = query_current_values_with_pending_wait(data); | ||
1384 | } | ||
1385 | |||
1375 | static unsigned int powernowk8_get(unsigned int cpu) | 1386 | static unsigned int powernowk8_get(unsigned int cpu) |
1376 | { | 1387 | { |
1377 | struct powernow_k8_data *data; | 1388 | struct powernow_k8_data *data = per_cpu(powernow_data, cpu); |
1378 | cpumask_t oldmask = current->cpus_allowed; | ||
1379 | unsigned int khz = 0; | 1389 | unsigned int khz = 0; |
1380 | unsigned int first; | 1390 | int err; |
1381 | |||
1382 | first = cpumask_first(cpu_core_mask(cpu)); | ||
1383 | data = per_cpu(powernow_data, first); | ||
1384 | 1391 | ||
1385 | if (!data) | 1392 | if (!data) |
1386 | return -EINVAL; | 1393 | return -EINVAL; |
1387 | 1394 | ||
1388 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | 1395 | smp_call_function_single(cpu, query_values_on_cpu, &err, true); |
1389 | if (smp_processor_id() != cpu) { | 1396 | if (err) |
1390 | printk(KERN_ERR PFX | ||
1391 | "limiting to CPU %d failed in powernowk8_get\n", cpu); | ||
1392 | set_cpus_allowed_ptr(current, &oldmask); | ||
1393 | return 0; | ||
1394 | } | ||
1395 | |||
1396 | if (query_current_values_with_pending_wait(data)) | ||
1397 | goto out; | 1397 | goto out; |
1398 | 1398 | ||
1399 | if (cpu_family == CPU_HW_PSTATE) | 1399 | if (cpu_family == CPU_HW_PSTATE) |
@@ -1404,7 +1404,6 @@ static unsigned int powernowk8_get(unsigned int cpu) | |||
1404 | 1404 | ||
1405 | 1405 | ||
1406 | out: | 1406 | out: |
1407 | set_cpus_allowed_ptr(current, &oldmask); | ||
1408 | return khz; | 1407 | return khz; |
1409 | } | 1408 | } |
1410 | 1409 | ||
@@ -1430,7 +1429,9 @@ static int __cpuinit powernowk8_init(void) | |||
1430 | unsigned int i, supported_cpus = 0; | 1429 | unsigned int i, supported_cpus = 0; |
1431 | 1430 | ||
1432 | for_each_online_cpu(i) { | 1431 | for_each_online_cpu(i) { |
1433 | if (check_supported_cpu(i)) | 1432 | int rc; |
1433 | smp_call_function_single(i, check_supported_cpu, &rc, 1); | ||
1434 | if (rc == 0) | ||
1434 | supported_cpus++; | 1435 | supported_cpus++; |
1435 | } | 1436 | } |
1436 | 1437 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 6c6698feade1..02ce824073cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h | |||
@@ -215,7 +215,8 @@ struct pst_s { | |||
215 | 215 | ||
216 | #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) | 216 | #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) |
217 | 217 | ||
218 | static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); | 218 | static int core_voltage_pre_transition(struct powernow_k8_data *data, |
219 | u32 reqvid, u32 reqfid); | ||
219 | static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); | 220 | static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); |
220 | static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); | 221 | static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); |
221 | 222 | ||
@@ -223,14 +224,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned | |||
223 | 224 | ||
224 | static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); | 225 | static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); |
225 | static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); | 226 | static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); |
226 | |||
227 | #ifdef CONFIG_SMP | ||
228 | static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) | ||
229 | { | ||
230 | } | ||
231 | #else | ||
232 | static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) | ||
233 | { | ||
234 | cpu_set(0, cpu_sharedcore_mask[0]); | ||
235 | } | ||
236 | #endif | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 55c831ed71ce..8d672ef162ce 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | |||
@@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu) | |||
323 | { | 323 | { |
324 | unsigned l, h; | 324 | unsigned l, h; |
325 | unsigned clock_freq; | 325 | unsigned clock_freq; |
326 | cpumask_t saved_mask; | ||
327 | 326 | ||
328 | saved_mask = current->cpus_allowed; | 327 | rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); |
329 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
330 | if (smp_processor_id() != cpu) | ||
331 | return 0; | ||
332 | |||
333 | rdmsr(MSR_IA32_PERF_STATUS, l, h); | ||
334 | clock_freq = extract_clock(l, cpu, 0); | 328 | clock_freq = extract_clock(l, cpu, 0); |
335 | 329 | ||
336 | if (unlikely(clock_freq == 0)) { | 330 | if (unlikely(clock_freq == 0)) { |
@@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu) | |||
340 | * P-state transition (like TM2). Get the last freq set | 334 | * P-state transition (like TM2). Get the last freq set |
341 | * in PERF_CTL. | 335 | * in PERF_CTL. |
342 | */ | 336 | */ |
343 | rdmsr(MSR_IA32_PERF_CTL, l, h); | 337 | rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); |
344 | clock_freq = extract_clock(l, cpu, 1); | 338 | clock_freq = extract_clock(l, cpu, 1); |
345 | } | 339 | } |
346 | |||
347 | set_cpus_allowed_ptr(current, &saved_mask); | ||
348 | return clock_freq; | 340 | return clock_freq; |
349 | } | 341 | } |
350 | 342 | ||
@@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
467 | struct cpufreq_freqs freqs; | 459 | struct cpufreq_freqs freqs; |
468 | int retval = 0; | 460 | int retval = 0; |
469 | unsigned int j, k, first_cpu, tmp; | 461 | unsigned int j, k, first_cpu, tmp; |
470 | cpumask_var_t saved_mask, covered_cpus; | 462 | cpumask_var_t covered_cpus; |
471 | 463 | ||
472 | if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) | 464 | if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) |
473 | return -ENOMEM; | ||
474 | if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { | ||
475 | free_cpumask_var(saved_mask); | ||
476 | return -ENOMEM; | 465 | return -ENOMEM; |
477 | } | ||
478 | cpumask_copy(saved_mask, ¤t->cpus_allowed); | ||
479 | 466 | ||
480 | if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { | 467 | if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { |
481 | retval = -ENODEV; | 468 | retval = -ENODEV; |
@@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
493 | 480 | ||
494 | first_cpu = 1; | 481 | first_cpu = 1; |
495 | for_each_cpu(j, policy->cpus) { | 482 | for_each_cpu(j, policy->cpus) { |
496 | const struct cpumask *mask; | 483 | int good_cpu; |
497 | 484 | ||
498 | /* cpufreq holds the hotplug lock, so we are safe here */ | 485 | /* cpufreq holds the hotplug lock, so we are safe here */ |
499 | if (!cpu_online(j)) | 486 | if (!cpu_online(j)) |
@@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
504 | * Make sure we are running on CPU that wants to change freq | 491 | * Make sure we are running on CPU that wants to change freq |
505 | */ | 492 | */ |
506 | if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) | 493 | if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) |
507 | mask = policy->cpus; | 494 | good_cpu = cpumask_any_and(policy->cpus, |
495 | cpu_online_mask); | ||
508 | else | 496 | else |
509 | mask = cpumask_of(j); | 497 | good_cpu = j; |
510 | 498 | ||
511 | set_cpus_allowed_ptr(current, mask); | 499 | if (good_cpu >= nr_cpu_ids) { |
512 | preempt_disable(); | ||
513 | if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { | ||
514 | dprintk("couldn't limit to CPUs in this domain\n"); | 500 | dprintk("couldn't limit to CPUs in this domain\n"); |
515 | retval = -EAGAIN; | 501 | retval = -EAGAIN; |
516 | if (first_cpu) { | 502 | if (first_cpu) { |
517 | /* We haven't started the transition yet. */ | 503 | /* We haven't started the transition yet. */ |
518 | goto migrate_end; | 504 | goto out; |
519 | } | 505 | } |
520 | preempt_enable(); | ||
521 | break; | 506 | break; |
522 | } | 507 | } |
523 | 508 | ||
524 | msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; | 509 | msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; |
525 | 510 | ||
526 | if (first_cpu) { | 511 | if (first_cpu) { |
527 | rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); | 512 | rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); |
528 | if (msr == (oldmsr & 0xffff)) { | 513 | if (msr == (oldmsr & 0xffff)) { |
529 | dprintk("no change needed - msr was and needs " | 514 | dprintk("no change needed - msr was and needs " |
530 | "to be %x\n", oldmsr); | 515 | "to be %x\n", oldmsr); |
531 | retval = 0; | 516 | retval = 0; |
532 | goto migrate_end; | 517 | goto out; |
533 | } | 518 | } |
534 | 519 | ||
535 | freqs.old = extract_clock(oldmsr, cpu, 0); | 520 | freqs.old = extract_clock(oldmsr, cpu, 0); |
@@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
553 | oldmsr |= msr; | 538 | oldmsr |= msr; |
554 | } | 539 | } |
555 | 540 | ||
556 | wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); | 541 | wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); |
557 | if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { | 542 | if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) |
558 | preempt_enable(); | ||
559 | break; | 543 | break; |
560 | } | ||
561 | 544 | ||
562 | cpu_set(j, *covered_cpus); | 545 | cpumask_set_cpu(j, covered_cpus); |
563 | preempt_enable(); | ||
564 | } | 546 | } |
565 | 547 | ||
566 | for_each_cpu(k, policy->cpus) { | 548 | for_each_cpu(k, policy->cpus) { |
@@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
578 | * Best effort undo.. | 560 | * Best effort undo.. |
579 | */ | 561 | */ |
580 | 562 | ||
581 | for_each_cpu_mask_nr(j, *covered_cpus) { | 563 | for_each_cpu(j, covered_cpus) |
582 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); | 564 | wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); |
583 | wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); | ||
584 | } | ||
585 | 565 | ||
586 | tmp = freqs.new; | 566 | tmp = freqs.new; |
587 | freqs.new = freqs.old; | 567 | freqs.new = freqs.old; |
@@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy, | |||
593 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); | 573 | cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); |
594 | } | 574 | } |
595 | } | 575 | } |
596 | set_cpus_allowed_ptr(current, saved_mask); | ||
597 | retval = 0; | 576 | retval = 0; |
598 | goto out; | ||
599 | 577 | ||
600 | migrate_end: | ||
601 | preempt_enable(); | ||
602 | set_cpus_allowed_ptr(current, saved_mask); | ||
603 | out: | 578 | out: |
604 | free_cpumask_var(saved_mask); | ||
605 | free_cpumask_var(covered_cpus); | 579 | free_cpumask_var(covered_cpus); |
606 | return retval; | 580 | return retval; |
607 | } | 581 | } |
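The same de-migration theme: rdmsr_on_cpu()/wrmsr_on_cpu() issue the MSR access on the named CPU via a cross-CPU call, so the governor no longer rebinds the current task. A minimal sketch, assuming <asm/msr.h> and the era-appropriate signatures:

    #include <asm/msr.h>

    /* Read IA32_PERF_STATUS on 'cpu' without migrating there. */
    static unsigned int cur_perf_status(unsigned int cpu)
    {
            u32 lo, hi;

            rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &lo, &hi);
            return lo;
    }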
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 016c1a4fa3fc..6911e91fb4f6 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | |||
@@ -89,7 +89,8 @@ static int speedstep_find_register(void) | |||
89 | * speedstep_set_state - set the SpeedStep state | 89 | * speedstep_set_state - set the SpeedStep state |
90 | * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) | 90 | * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) |
91 | * | 91 | * |
92 | * Tries to change the SpeedStep state. | 92 | * Tries to change the SpeedStep state. Can be called from |
93 | * smp_call_function_single. | ||
93 | */ | 94 | */ |
94 | static void speedstep_set_state(unsigned int state) | 95 | static void speedstep_set_state(unsigned int state) |
95 | { | 96 | { |
@@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state) | |||
143 | return; | 144 | return; |
144 | } | 145 | } |
145 | 146 | ||
147 | /* Wrapper for smp_call_function_single. */ | ||
148 | static void _speedstep_set_state(void *_state) | ||
149 | { | ||
150 | speedstep_set_state(*(unsigned int *)_state); | ||
151 | } | ||
146 | 152 | ||
147 | /** | 153 | /** |
148 | * speedstep_activate - activate SpeedStep control in the chipset | 154 | * speedstep_activate - activate SpeedStep control in the chipset |
@@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void) | |||
226 | return 0; | 232 | return 0; |
227 | } | 233 | } |
228 | 234 | ||
229 | static unsigned int _speedstep_get(const struct cpumask *cpus) | 235 | struct get_freq_data { |
230 | { | ||
231 | unsigned int speed; | 236 | unsigned int speed; |
232 | cpumask_t cpus_allowed; | 237 | unsigned int processor; |
233 | 238 | }; | |
234 | cpus_allowed = current->cpus_allowed; | 239 | |
235 | set_cpus_allowed_ptr(current, cpus); | 240 | static void get_freq_data(void *_data) |
236 | speed = speedstep_get_frequency(speedstep_processor); | 241 | { |
237 | set_cpus_allowed_ptr(current, &cpus_allowed); | 242 | struct get_freq_data *data = _data; |
238 | dprintk("detected %u kHz as current frequency\n", speed); | 243 | |
239 | return speed; | 244 | data->speed = speedstep_get_frequency(data->processor); |
240 | } | 245 | } |
241 | 246 | ||
242 | static unsigned int speedstep_get(unsigned int cpu) | 247 | static unsigned int speedstep_get(unsigned int cpu) |
243 | { | 248 | { |
244 | return _speedstep_get(cpumask_of(cpu)); | 249 | struct get_freq_data data = { .processor = cpu }; |
250 | |||
251 | /* You're supposed to ensure CPU is online. */ | ||
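Because the remote callback returns void, get_freq_data() carries both the input (processor) and the output (speed) in one struct whose address travels through the info pointer; passing a stack struct is only safe because wait=1 blocks until the remote CPU is done with the frame. Condensed to the pattern, with a hypothetical struct name:

    #include <linux/smp.h>
    #include <linux/kernel.h>

    struct freq_query {
            unsigned int processor;     /* in */
            unsigned int speed;         /* out */
    };

    static void query_freq(void *_q)
    {
            struct freq_query *q = _q;

            q->speed = speedstep_get_frequency(q->processor);
    }

    static unsigned int freq_on_cpu(int cpu, unsigned int processor)
    {
            struct freq_query q = { .processor = processor };

            /* Nonzero return means the CPU is offline: caller must prevent that. */
            if (smp_call_function_single(cpu, query_freq, &q, 1))
                    BUG();
            return q.speed;
    }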
252 | if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) | ||
253 | BUG(); | ||
254 | |||
255 | dprintk("detected %u kHz as current frequency\n", data.speed); | ||
256 | return data.speed; | ||
245 | } | 257 | } |
246 | 258 | ||
247 | /** | 259 | /** |
@@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy, | |||
257 | unsigned int target_freq, | 269 | unsigned int target_freq, |
258 | unsigned int relation) | 270 | unsigned int relation) |
259 | { | 271 | { |
260 | unsigned int newstate = 0; | 272 | unsigned int newstate = 0, policy_cpu; |
261 | struct cpufreq_freqs freqs; | 273 | struct cpufreq_freqs freqs; |
262 | cpumask_t cpus_allowed; | ||
263 | int i; | 274 | int i; |
264 | 275 | ||
265 | if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], | 276 | if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], |
266 | target_freq, relation, &newstate)) | 277 | target_freq, relation, &newstate)) |
267 | return -EINVAL; | 278 | return -EINVAL; |
268 | 279 | ||
269 | freqs.old = _speedstep_get(policy->cpus); | 280 | policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); |
281 | freqs.old = speedstep_get(policy_cpu); | ||
270 | freqs.new = speedstep_freqs[newstate].frequency; | 282 | freqs.new = speedstep_freqs[newstate].frequency; |
271 | freqs.cpu = policy->cpu; | 283 | freqs.cpu = policy->cpu; |
272 | 284 | ||
@@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy, | |||
276 | if (freqs.old == freqs.new) | 288 | if (freqs.old == freqs.new) |
277 | return 0; | 289 | return 0; |
278 | 290 | ||
279 | cpus_allowed = current->cpus_allowed; | ||
280 | |||
281 | for_each_cpu(i, policy->cpus) { | 291 | for_each_cpu(i, policy->cpus) { |
282 | freqs.cpu = i; | 292 | freqs.cpu = i; |
283 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); | 293 | cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); |
284 | } | 294 | } |
285 | 295 | ||
286 | /* switch to physical CPU where state is to be changed */ | 296 | smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, |
287 | set_cpus_allowed_ptr(current, policy->cpus); | 297 | true); |
288 | |||
289 | speedstep_set_state(newstate); | ||
290 | |||
291 | /* allow to be run on all CPUs */ | ||
292 | set_cpus_allowed_ptr(current, &cpus_allowed); | ||
293 | 298 | ||
294 | for_each_cpu(i, policy->cpus) { | 299 | for_each_cpu(i, policy->cpus) { |
295 | freqs.cpu = i; | 300 | freqs.cpu = i; |
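Since the work is now shipped to a CPU rather than the task migrating onto one, the driver needs a concrete online member of policy->cpus; that is what cpumask_any_and() against cpu_online_mask provides. A sketch of the selection plus the emptiness check that the patch itself leaves implicit:

    #include <linux/cpumask.h>
    #include <linux/errno.h>

    static int pick_online_cpu(const struct cpumask *policy_cpus)
    {
            unsigned int cpu = cpumask_any_and(policy_cpus, cpu_online_mask);

            /* cpumask_any_and() returns >= nr_cpu_ids if the intersection is empty */
            if (cpu >= nr_cpu_ids)
                    return -ENODEV;
            return cpu;
    }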
@@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy) | |||
312 | return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); | 317 | return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); |
313 | } | 318 | } |
314 | 319 | ||
320 | struct get_freqs { | ||
321 | struct cpufreq_policy *policy; | ||
322 | int ret; | ||
323 | }; | ||
324 | |||
325 | static void get_freqs_on_cpu(void *_get_freqs) | ||
326 | { | ||
327 | struct get_freqs *get_freqs = _get_freqs; | ||
328 | |||
329 | get_freqs->ret = | ||
330 | speedstep_get_freqs(speedstep_processor, | ||
331 | &speedstep_freqs[SPEEDSTEP_LOW].frequency, | ||
332 | &speedstep_freqs[SPEEDSTEP_HIGH].frequency, | ||
333 | &get_freqs->policy->cpuinfo.transition_latency, | ||
334 | &speedstep_set_state); | ||
335 | } | ||
315 | 336 | ||
316 | static int speedstep_cpu_init(struct cpufreq_policy *policy) | 337 | static int speedstep_cpu_init(struct cpufreq_policy *policy) |
317 | { | 338 | { |
318 | int result = 0; | 339 | int result; |
319 | unsigned int speed; | 340 | unsigned int policy_cpu, speed; |
320 | cpumask_t cpus_allowed; | 341 | struct get_freqs gf; |
321 | 342 | ||
322 | /* only run on CPU to be set, or on its sibling */ | 343 | /* only run on CPU to be set, or on its sibling */ |
323 | #ifdef CONFIG_SMP | 344 | #ifdef CONFIG_SMP |
324 | cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); | 345 | cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); |
325 | #endif | 346 | #endif |
326 | 347 | policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); | |
327 | cpus_allowed = current->cpus_allowed; | ||
328 | set_cpus_allowed_ptr(current, policy->cpus); | ||
329 | 348 | ||
330 | /* detect low and high frequency and transition latency */ | 349 | /* detect low and high frequency and transition latency */ |
331 | result = speedstep_get_freqs(speedstep_processor, | 350 | gf.policy = policy; |
332 | &speedstep_freqs[SPEEDSTEP_LOW].frequency, | 351 | smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); |
333 | &speedstep_freqs[SPEEDSTEP_HIGH].frequency, | 352 | if (gf.ret) |
334 | &policy->cpuinfo.transition_latency, | 353 | return gf.ret; |
335 | &speedstep_set_state); | ||
336 | set_cpus_allowed_ptr(current, &cpus_allowed); | ||
337 | if (result) | ||
338 | return result; | ||
339 | 354 | ||
340 | /* get current speed setting */ | 355 | /* get current speed setting */ |
341 | speed = _speedstep_get(policy->cpus); | 356 | speed = speedstep_get(policy_cpu); |
342 | if (!speed) | 357 | if (!speed) |
343 | return -EIO; | 358 | return -EIO; |
344 | 359 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 2e3c6862657b..f4c290b8482f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | |||
@@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void) | |||
226 | } | 226 | } |
227 | 227 | ||
228 | 228 | ||
229 | /* Warning: may get called from smp_call_function_single. */ | ||
229 | unsigned int speedstep_get_frequency(unsigned int processor) | 230 | unsigned int speedstep_get_frequency(unsigned int processor) |
230 | { | 231 | { |
231 | switch (processor) { | 232 | switch (processor) { |
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 45004faf67ea..188a1ca5ad2b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile | |||
@@ -1,11 +1,12 @@ | |||
1 | obj-y = mce.o therm_throt.o | 1 | obj-y = mce.o |
2 | 2 | ||
3 | obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o | 3 | obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o |
4 | obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o | 4 | obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o |
5 | obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o | 5 | obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o |
6 | obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o | 6 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o |
7 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o | 7 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o |
8 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o | ||
9 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o | 8 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o |
10 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o | 9 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o |
11 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o | 10 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o |
11 | |||
12 | obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o | ||
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index 89e510424152..b945d5dbc609 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c | |||
@@ -10,10 +10,9 @@ | |||
10 | 10 | ||
11 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 12 | #include <asm/system.h> |
13 | #include <asm/mce.h> | ||
13 | #include <asm/msr.h> | 14 | #include <asm/msr.h> |
14 | 15 | ||
15 | #include "mce.h" | ||
16 | |||
17 | /* Machine Check Handler For AMD Athlon/Duron: */ | 16 | /* Machine Check Handler For AMD Athlon/Duron: */ |
18 | static void k7_machine_check(struct pt_regs *regs, long error_code) | 17 | static void k7_machine_check(struct pt_regs *regs, long error_code) |
19 | { | 18 | { |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fabba15e4558..01213048f62f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -44,7 +44,6 @@ | |||
44 | #include <asm/msr.h> | 44 | #include <asm/msr.h> |
45 | 45 | ||
46 | #include "mce-internal.h" | 46 | #include "mce-internal.h" |
47 | #include "mce.h" | ||
48 | 47 | ||
49 | /* Handle unconfigured int18 (should never happen) */ | 48 | /* Handle unconfigured int18 (should never happen) */ |
50 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | 49 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) |
@@ -57,7 +56,7 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) | |||
57 | void (*machine_check_vector)(struct pt_regs *, long error_code) = | 56 | void (*machine_check_vector)(struct pt_regs *, long error_code) = |
58 | unexpected_machine_check; | 57 | unexpected_machine_check; |
59 | 58 | ||
60 | int mce_disabled; | 59 | int mce_disabled __read_mostly; |
61 | 60 | ||
62 | #ifdef CONFIG_X86_NEW_MCE | 61 | #ifdef CONFIG_X86_NEW_MCE |
63 | 62 | ||
@@ -76,21 +75,22 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); | |||
76 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | 75 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors |
77 | * 3: never panic or SIGBUS, log all errors (for testing only) | 76 | * 3: never panic or SIGBUS, log all errors (for testing only) |
78 | */ | 77 | */ |
79 | static int tolerant = 1; | 78 | static int tolerant __read_mostly = 1; |
80 | static int banks; | 79 | static int banks __read_mostly; |
81 | static u64 *bank; | 80 | static u64 *bank __read_mostly; |
82 | static unsigned long notify_user; | 81 | static int rip_msr __read_mostly; |
83 | static int rip_msr; | 82 | static int mce_bootlog __read_mostly = -1; |
84 | static int mce_bootlog = -1; | 83 | static int monarch_timeout __read_mostly = -1; |
85 | static int monarch_timeout = -1; | 84 | static int mce_panic_timeout __read_mostly; |
86 | static int mce_panic_timeout; | 85 | static int mce_dont_log_ce __read_mostly; |
87 | static int mce_dont_log_ce; | 86 | int mce_cmci_disabled __read_mostly; |
88 | int mce_cmci_disabled; | 87 | int mce_ignore_ce __read_mostly; |
89 | int mce_ignore_ce; | 88 | int mce_ser __read_mostly; |
90 | int mce_ser; | 89 | |
91 | 90 | /* User mode helper program triggered by machine check event */ | |
92 | static char trigger[128]; | 91 | static unsigned long mce_need_notify; |
93 | static char *trigger_argv[2] = { trigger, NULL }; | 92 | static char mce_helper[128]; |
93 | static char *mce_helper_argv[2] = { mce_helper, NULL }; | ||
94 | 94 | ||
95 | static unsigned long dont_init_banks; | 95 | static unsigned long dont_init_banks; |
96 | 96 | ||
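The __read_mostly batch is a cache-layout change, not a behavioral one: the annotation places each variable in a dedicated read-mostly data section, so tunables that are written once at boot and then only read in the machine-check path cannot share (and keep bouncing) a cache line with hot, frequently written data. Illustrative only, with made-up variable names:

    #include <linux/cache.h>
    #include <asm/atomic.h>

    /* Set at boot, read on every machine check: grouped away from hot data. */
    static int tolerant __read_mostly = 1;

    /* Bumped constantly: would cause false sharing next to the line above. */
    static atomic_t mce_events;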
@@ -180,7 +180,7 @@ void mce_log(struct mce *mce) | |||
180 | wmb(); | 180 | wmb(); |
181 | 181 | ||
182 | mce->finished = 1; | 182 | mce->finished = 1; |
183 | set_bit(0, ¬ify_user); | 183 | set_bit(0, &mce_need_notify); |
184 | } | 184 | } |
185 | 185 | ||
186 | static void print_mce(struct mce *m) | 186 | static void print_mce(struct mce *m) |
@@ -194,14 +194,14 @@ static void print_mce(struct mce *m) | |||
194 | m->cs, m->ip); | 194 | m->cs, m->ip); |
195 | if (m->cs == __KERNEL_CS) | 195 | if (m->cs == __KERNEL_CS) |
196 | print_symbol("{%s}", m->ip); | 196 | print_symbol("{%s}", m->ip); |
197 | printk("\n"); | 197 | printk(KERN_CONT "\n"); |
198 | } | 198 | } |
199 | printk(KERN_EMERG "TSC %llx ", m->tsc); | 199 | printk(KERN_EMERG "TSC %llx ", m->tsc); |
200 | if (m->addr) | 200 | if (m->addr) |
201 | printk("ADDR %llx ", m->addr); | 201 | printk(KERN_CONT "ADDR %llx ", m->addr); |
202 | if (m->misc) | 202 | if (m->misc) |
203 | printk("MISC %llx ", m->misc); | 203 | printk(KERN_CONT "MISC %llx ", m->misc); |
204 | printk("\n"); | 204 | printk(KERN_CONT "\n"); |
205 | printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | 205 | printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", |
206 | m->cpuvendor, m->cpuid, m->time, m->socketid, | 206 | m->cpuvendor, m->cpuid, m->time, m->socketid, |
207 | m->apicid); | 207 | m->apicid); |
@@ -209,13 +209,13 @@ static void print_mce(struct mce *m) | |||
209 | 209 | ||
210 | static void print_mce_head(void) | 210 | static void print_mce_head(void) |
211 | { | 211 | { |
212 | printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); | 212 | printk(KERN_EMERG "\nHARDWARE ERROR\n"); |
213 | } | 213 | } |
214 | 214 | ||
215 | static void print_mce_tail(void) | 215 | static void print_mce_tail(void) |
216 | { | 216 | { |
217 | printk(KERN_EMERG "This is not a software problem!\n" | 217 | printk(KERN_EMERG "This is not a software problem!\n" |
218 | KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); | 218 | "Run through mcelog --ascii to decode and contact your hardware vendor\n"); |
219 | } | 219 | } |
220 | 220 | ||
221 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | 221 | #define PANIC_TIMEOUT 5 /* 5 seconds */ |
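The printk fixes track a semantic rule: a KERN_* level is only honored at the very start of the format string, so the old mid-string KERN_EMERG ends up as literal text in the log, and every continuation of a partial line needs KERN_CONT or it may be treated as a fresh message. The corrected pattern, reduced to a sketch:

    #include <linux/kernel.h>

    static void print_record(unsigned long long tsc, unsigned long long addr)
    {
            printk(KERN_EMERG "TSC %llx ", tsc);        /* starts the line */
            if (addr)
                    printk(KERN_CONT "ADDR %llx ", addr);
            printk(KERN_CONT "\n");                     /* terminates it */
    }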
@@ -691,18 +691,21 @@ static atomic_t global_nwo; | |||
691 | * in the entry order. | 691 | * in the entry order. |
692 | * TBD: double-check parallel CPU hot-unplug | 692 | * TBD: double-check parallel CPU hot-unplug |
693 | */ | 693 | */ |
694 | static int mce_start(int no_way_out, int *order) | 694 | static int mce_start(int *no_way_out) |
695 | { | 695 | { |
696 | int nwo; | 696 | int order; |
697 | int cpus = num_online_cpus(); | 697 | int cpus = num_online_cpus(); |
698 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; | 698 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; |
699 | 699 | ||
700 | if (!timeout) { | 700 | if (!timeout) |
701 | *order = -1; | 701 | return -1; |
702 | return no_way_out; | ||
703 | } | ||
704 | 702 | ||
705 | atomic_add(no_way_out, &global_nwo); | 703 | atomic_add(*no_way_out, &global_nwo); |
704 | /* | ||
705 | * global_nwo should be updated before mce_callin | ||
706 | */ | ||
707 | smp_wmb(); | ||
708 | order = atomic_add_return(1, &mce_callin); | ||
706 | 709 | ||
707 | /* | 710 | /* |
708 | * Wait for everyone. | 711 | * Wait for everyone. |
@@ -710,40 +713,43 @@ static int mce_start(int no_way_out, int *order) | |||
710 | while (atomic_read(&mce_callin) != cpus) { | 713 | while (atomic_read(&mce_callin) != cpus) { |
711 | if (mce_timed_out(&timeout)) { | 714 | if (mce_timed_out(&timeout)) { |
712 | atomic_set(&global_nwo, 0); | 715 | atomic_set(&global_nwo, 0); |
713 | *order = -1; | 716 | return -1; |
714 | return no_way_out; | ||
715 | } | 717 | } |
716 | ndelay(SPINUNIT); | 718 | ndelay(SPINUNIT); |
717 | } | 719 | } |
718 | 720 | ||
719 | /* | 721 | /* |
720 | * Cache the global no_way_out state. | 722 | * mce_callin should be read before global_nwo |
721 | */ | 723 | */ |
722 | nwo = atomic_read(&global_nwo); | 724 | smp_rmb(); |
723 | 725 | ||
724 | /* | 726 | if (order == 1) { |
725 | * Monarch starts executing now, the others wait. | 727 | /* |
726 | */ | 728 | * Monarch: Starts executing now, the others wait. |
727 | if (*order == 1) { | 729 | */ |
728 | atomic_set(&mce_executing, 1); | 730 | atomic_set(&mce_executing, 1); |
729 | return nwo; | 731 | } else { |
732 | /* | ||
733 | * Subject: Now start the scanning loop one by one in | ||
734 | * the original callin order. | ||
735 | * This way, when there are any shared banks, they will only be | ||
736 | * seen by one CPU before being cleared, avoiding duplicates. | ||
737 | */ | ||
738 | while (atomic_read(&mce_executing) < order) { | ||
739 | if (mce_timed_out(&timeout)) { | ||
740 | atomic_set(&global_nwo, 0); | ||
741 | return -1; | ||
742 | } | ||
743 | ndelay(SPINUNIT); | ||
744 | } | ||
730 | } | 745 | } |
731 | 746 | ||
732 | /* | 747 | /* |
733 | * Now start the scanning loop one by one | 748 | * Cache the global no_way_out state. |
734 | * in the original callin order. | ||
735 | * This way when there are any shared banks it will | ||
736 | * be only seen by one CPU before cleared, avoiding duplicates. | ||
737 | */ | 749 | */ |
738 | while (atomic_read(&mce_executing) < *order) { | 750 | *no_way_out = atomic_read(&global_nwo); |
739 | if (mce_timed_out(&timeout)) { | 751 | |
740 | atomic_set(&global_nwo, 0); | 752 | return order; |
741 | *order = -1; | ||
742 | return no_way_out; | ||
743 | } | ||
744 | ndelay(SPINUNIT); | ||
745 | } | ||
746 | return nwo; | ||
747 | } | 753 | } |
748 | 754 | ||
749 | /* | 755 | /* |
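The reworked mce_start() is a lock-free rendezvous: every CPU publishes its no_way_out vote into global_nwo and only then takes its ticket from mce_callin, so the smp_wmb()/smp_rmb() pair guarantees that once the last CPU has checked in, every vote is visible to everyone. Stripped to the ordering skeleton (timeout handling and the monarch/subject sequencing omitted, so this is a sketch, not the full function):

    #include <linux/smp.h>
    #include <asm/atomic.h>
    #include <asm/processor.h>

    static atomic_t global_nwo, mce_callin;

    static int rendezvous(int *no_way_out, int cpus)
    {
            int order;

            atomic_add(*no_way_out, &global_nwo);   /* publish my vote... */
            smp_wmb();                              /* ...before checking in */
            order = atomic_add_return(1, &mce_callin);

            while (atomic_read(&mce_callin) != cpus)
                    cpu_relax();

            smp_rmb();                              /* pairs with the smp_wmb() above */
            *no_way_out = atomic_read(&global_nwo); /* all votes visible now */
            return order;                           /* 1 == monarch */
    }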
@@ -863,7 +869,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
863 | * check handler. | 869 | * check handler. |
864 | */ | 870 | */ |
865 | int order; | 871 | int order; |
866 | |||
867 | /* | 872 | /* |
868 | * If no_way_out gets set, there is no safe way to recover from this | 873 | * If no_way_out gets set, there is no safe way to recover from this |
869 | * MCE. If tolerant is cranked up, we'll try anyway. | 874 | * MCE. If tolerant is cranked up, we'll try anyway. |
@@ -887,7 +892,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
887 | if (!banks) | 892 | if (!banks) |
888 | goto out; | 893 | goto out; |
889 | 894 | ||
890 | order = atomic_add_return(1, &mce_callin); | ||
891 | mce_setup(&m); | 895 | mce_setup(&m); |
892 | 896 | ||
893 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); | 897 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); |
@@ -909,7 +913,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
909 | * This way we don't report duplicated events on shared banks | 913 | * This way we don't report duplicated events on shared banks |
910 | * because the first one to see it will clear it. | 914 | * because the first one to see it will clear it. |
911 | */ | 915 | */ |
912 | no_way_out = mce_start(no_way_out, &order); | 916 | order = mce_start(&no_way_out); |
913 | for (i = 0; i < banks; i++) { | 917 | for (i = 0; i < banks; i++) { |
914 | __clear_bit(i, toclear); | 918 | __clear_bit(i, toclear); |
915 | if (!bank[i]) | 919 | if (!bank[i]) |
@@ -1113,12 +1117,12 @@ static void mcheck_timer(unsigned long data) | |||
1113 | *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); | 1117 | *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); |
1114 | 1118 | ||
1115 | t->expires = jiffies + *n; | 1119 | t->expires = jiffies + *n; |
1116 | add_timer(t); | 1120 | add_timer_on(t, smp_processor_id()); |
1117 | } | 1121 | } |
1118 | 1122 | ||
1119 | static void mce_do_trigger(struct work_struct *work) | 1123 | static void mce_do_trigger(struct work_struct *work) |
1120 | { | 1124 | { |
1121 | call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); | 1125 | call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); |
1122 | } | 1126 | } |
1123 | 1127 | ||
1124 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | 1128 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); |
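mcheck_timer polls the banks of the CPU it runs on, so the timer must fire on the CPU that armed it; plain add_timer() gives no such guarantee, while add_timer_on() pins the expiry to an explicit CPU. The per-CPU rearm pattern, with a hypothetical timer name:

    #include <linux/timer.h>
    #include <linux/jiffies.h>
    #include <linux/percpu.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(struct timer_list, poll_timer);

    static void rearm_poll_timer(unsigned long delay)
    {
            struct timer_list *t = &__get_cpu_var(poll_timer);

            t->expires = jiffies + delay;
            add_timer_on(t, smp_processor_id());    /* fire on this CPU only */
    }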
@@ -1135,7 +1139,7 @@ int mce_notify_irq(void) | |||
1135 | 1139 | ||
1136 | clear_thread_flag(TIF_MCE_NOTIFY); | 1140 | clear_thread_flag(TIF_MCE_NOTIFY); |
1137 | 1141 | ||
1138 | if (test_and_clear_bit(0, ¬ify_user)) { | 1142 | if (test_and_clear_bit(0, &mce_need_notify)) { |
1139 | wake_up_interruptible(&mce_wait); | 1143 | wake_up_interruptible(&mce_wait); |
1140 | 1144 | ||
1141 | /* | 1145 | /* |
@@ -1143,7 +1147,7 @@ int mce_notify_irq(void) | |||
1143 | * work_pending is always cleared before the function is | 1147 | * work_pending is always cleared before the function is |
1144 | * executed. | 1148 | * executed. |
1145 | */ | 1149 | */ |
1146 | if (trigger[0] && !work_pending(&mce_trigger_work)) | 1150 | if (mce_helper[0] && !work_pending(&mce_trigger_work)) |
1147 | schedule_work(&mce_trigger_work); | 1151 | schedule_work(&mce_trigger_work); |
1148 | 1152 | ||
1149 | if (__ratelimit(&ratelimit)) | 1153 | if (__ratelimit(&ratelimit)) |
@@ -1222,8 +1226,13 @@ static void mce_init(void) | |||
1222 | } | 1226 | } |
1223 | 1227 | ||
1224 | /* Add per CPU specific workarounds here */ | 1228 | /* Add per CPU specific workarounds here */ |
1225 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) | 1229 | static int mce_cpu_quirks(struct cpuinfo_x86 *c) |
1226 | { | 1230 | { |
1231 | if (c->x86_vendor == X86_VENDOR_UNKNOWN) { | ||
1232 | pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); | ||
1233 | return -EOPNOTSUPP; | ||
1234 | } | ||
1235 | |||
1227 | /* This should be disabled by the BIOS, but isn't always */ | 1236 | /* This should be disabled by the BIOS, but isn't always */ |
1228 | if (c->x86_vendor == X86_VENDOR_AMD) { | 1237 | if (c->x86_vendor == X86_VENDOR_AMD) { |
1229 | if (c->x86 == 15 && banks > 4) { | 1238 | if (c->x86 == 15 && banks > 4) { |
@@ -1245,7 +1254,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
1245 | * Various K7s with broken bank 0 around. Always disable | 1254 | * Various K7s with broken bank 0 around. Always disable |
1246 | * by default. | 1255 | * by default. |
1247 | */ | 1256 | */ |
1248 | if (c->x86 == 6) | 1257 | if (c->x86 == 6 && banks > 0) |
1249 | bank[0] = 0; | 1258 | bank[0] = 0; |
1250 | } | 1259 | } |
1251 | 1260 | ||
@@ -1269,11 +1278,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
1269 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && | 1278 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && |
1270 | monarch_timeout < 0) | 1279 | monarch_timeout < 0) |
1271 | monarch_timeout = USEC_PER_SEC; | 1280 | monarch_timeout = USEC_PER_SEC; |
1281 | |||
1282 | /* | ||
1283 | * There are also broken BIOSes on some Pentium M and | ||
1284 | * earlier systems: | ||
1285 | */ | ||
1286 | if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) | ||
1287 | mce_bootlog = 0; | ||
1272 | } | 1288 | } |
1273 | if (monarch_timeout < 0) | 1289 | if (monarch_timeout < 0) |
1274 | monarch_timeout = 0; | 1290 | monarch_timeout = 0; |
1275 | if (mce_bootlog != 0) | 1291 | if (mce_bootlog != 0) |
1276 | mce_panic_timeout = 30; | 1292 | mce_panic_timeout = 30; |
1293 | |||
1294 | return 0; | ||
1277 | } | 1295 | } |
1278 | 1296 | ||
1279 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) | 1297 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) |
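Turning mce_cpu_quirks() into an int function lets quirk processing veto MCE setup outright: on an unknown vendor the bank semantics cannot be trusted, so refusing with -EOPNOTSUPP is safer than guessing. The resulting init-time gate, reduced to its shape with hypothetical helper names:

    #include <linux/errno.h>

    static int check_quirks(struct cpuinfo_x86 *c)
    {
            if (c->x86_vendor == X86_VENDOR_UNKNOWN)
                    return -EOPNOTSUPP;     /* bank layout unknown: bail out */
            /* ... vendor-specific fixups ... */
            return 0;
    }

    void __cpuinit init_machine_check(struct cpuinfo_x86 *c)
    {
            if (mce_cap_init() < 0 || check_quirks(c) < 0) {
                    mce_disabled = 1;       /* one flag gates the whole subsystem */
                    return;
            }
            /* ... install handler, arm the polling timer ... */
    }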
@@ -1282,8 +1300,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) | |||
1282 | return; | 1300 | return; |
1283 | switch (c->x86_vendor) { | 1301 | switch (c->x86_vendor) { |
1284 | case X86_VENDOR_INTEL: | 1302 | case X86_VENDOR_INTEL: |
1285 | if (mce_p5_enabled()) | 1303 | intel_p5_mcheck_init(c); |
1286 | intel_p5_mcheck_init(c); | ||
1287 | break; | 1304 | break; |
1288 | case X86_VENDOR_CENTAUR: | 1305 | case X86_VENDOR_CENTAUR: |
1289 | winchip_mcheck_init(c); | 1306 | winchip_mcheck_init(c); |
@@ -1318,7 +1335,7 @@ static void mce_init_timer(void) | |||
1318 | return; | 1335 | return; |
1319 | setup_timer(t, mcheck_timer, smp_processor_id()); | 1336 | setup_timer(t, mcheck_timer, smp_processor_id()); |
1320 | t->expires = round_jiffies(jiffies + *n); | 1337 | t->expires = round_jiffies(jiffies + *n); |
1321 | add_timer(t); | 1338 | add_timer_on(t, smp_processor_id()); |
1322 | } | 1339 | } |
1323 | 1340 | ||
1324 | /* | 1341 | /* |
@@ -1335,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | |||
1335 | if (!mce_available(c)) | 1352 | if (!mce_available(c)) |
1336 | return; | 1353 | return; |
1337 | 1354 | ||
1338 | if (mce_cap_init() < 0) { | 1355 | if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { |
1339 | mce_disabled = 1; | 1356 | mce_disabled = 1; |
1340 | return; | 1357 | return; |
1341 | } | 1358 | } |
1342 | mce_cpu_quirks(c); | ||
1343 | 1359 | ||
1344 | machine_check_vector = do_machine_check; | 1360 | machine_check_vector = do_machine_check; |
1345 | 1361 | ||
@@ -1609,8 +1625,9 @@ static int mce_resume(struct sys_device *dev) | |||
1609 | static void mce_cpu_restart(void *data) | 1625 | static void mce_cpu_restart(void *data) |
1610 | { | 1626 | { |
1611 | del_timer_sync(&__get_cpu_var(mce_timer)); | 1627 | del_timer_sync(&__get_cpu_var(mce_timer)); |
1612 | if (mce_available(¤t_cpu_data)) | 1628 | if (!mce_available(¤t_cpu_data)) |
1613 | mce_init(); | 1629 | return; |
1630 | mce_init(); | ||
1614 | mce_init_timer(); | 1631 | mce_init_timer(); |
1615 | } | 1632 | } |
1616 | 1633 | ||
@@ -1620,6 +1637,26 @@ static void mce_restart(void) | |||
1620 | on_each_cpu(mce_cpu_restart, NULL, 1); | 1637 | on_each_cpu(mce_cpu_restart, NULL, 1); |
1621 | } | 1638 | } |
1622 | 1639 | ||
1640 | /* Toggle features for corrected errors */ | ||
1641 | static void mce_disable_ce(void *all) | ||
1642 | { | ||
1643 | if (!mce_available(¤t_cpu_data)) | ||
1644 | return; | ||
1645 | if (all) | ||
1646 | del_timer_sync(&__get_cpu_var(mce_timer)); | ||
1647 | cmci_clear(); | ||
1648 | } | ||
1649 | |||
1650 | static void mce_enable_ce(void *all) | ||
1651 | { | ||
1652 | if (!mce_available(¤t_cpu_data)) | ||
1653 | return; | ||
1654 | cmci_reenable(); | ||
1655 | cmci_recheck(); | ||
1656 | if (all) | ||
1657 | mce_init_timer(); | ||
1658 | } | ||
1659 | |||
1623 | static struct sysdev_class mce_sysclass = { | 1660 | static struct sysdev_class mce_sysclass = { |
1624 | .suspend = mce_suspend, | 1661 | .suspend = mce_suspend, |
1625 | .shutdown = mce_shutdown, | 1662 | .shutdown = mce_shutdown, |
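mce_disable_ce() and mce_enable_ce() serve two different sysfs knobs by smuggling a boolean through the on_each_cpu() info pointer: ignore_ce passes (void *)1 so the polling timer is stopped and restarted as well, while cmci_disabled passes NULL to touch CMCI alone. The idiom in isolation, with hypothetical helper names:

    #include <linux/smp.h>

    static void stop_poll_timer(void) { /* hypothetical */ }

    static void toggle_ce(void *all)
    {
            if (all)                /* pointer doubles as a boolean flag */
                    stop_poll_timer();
            cmci_clear();           /* always disable CMCI on this CPU */
    }

    static void disable_ce_everywhere(int full)
    {
            on_each_cpu(toggle_ce, full ? (void *)1 : NULL, 1);
    }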
@@ -1659,26 +1696,70 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, | |||
1659 | static ssize_t | 1696 | static ssize_t |
1660 | show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) | 1697 | show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) |
1661 | { | 1698 | { |
1662 | strcpy(buf, trigger); | 1699 | strcpy(buf, mce_helper); |
1663 | strcat(buf, "\n"); | 1700 | strcat(buf, "\n"); |
1664 | return strlen(trigger) + 1; | 1701 | return strlen(mce_helper) + 1; |
1665 | } | 1702 | } |
1666 | 1703 | ||
1667 | static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, | 1704 | static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, |
1668 | const char *buf, size_t siz) | 1705 | const char *buf, size_t siz) |
1669 | { | 1706 | { |
1670 | char *p; | 1707 | char *p; |
1671 | int len; | ||
1672 | 1708 | ||
1673 | strncpy(trigger, buf, sizeof(trigger)); | 1709 | strncpy(mce_helper, buf, sizeof(mce_helper)); |
1674 | trigger[sizeof(trigger)-1] = 0; | 1710 | mce_helper[sizeof(mce_helper)-1] = 0; |
1675 | len = strlen(trigger); | 1711 | p = strchr(mce_helper, '\n'); |
1676 | p = strchr(trigger, '\n'); | ||
1677 | 1712 | ||
1678 | if (*p) | 1713 | if (p) |
1679 | *p = 0; | 1714 | *p = 0; |
1680 | 1715 | ||
1681 | return len; | 1716 | return strlen(mce_helper) + !!p; |
1717 | } | ||
1718 | |||
1719 | static ssize_t set_ignore_ce(struct sys_device *s, | ||
1720 | struct sysdev_attribute *attr, | ||
1721 | const char *buf, size_t size) | ||
1722 | { | ||
1723 | u64 new; | ||
1724 | |||
1725 | if (strict_strtoull(buf, 0, &new) < 0) | ||
1726 | return -EINVAL; | ||
1727 | |||
1728 | if (mce_ignore_ce ^ !!new) { | ||
1729 | if (new) { | ||
1730 | /* disable ce features */ | ||
1731 | on_each_cpu(mce_disable_ce, (void *)1, 1); | ||
1732 | mce_ignore_ce = 1; | ||
1733 | } else { | ||
1734 | /* enable ce features */ | ||
1735 | mce_ignore_ce = 0; | ||
1736 | on_each_cpu(mce_enable_ce, (void *)1, 1); | ||
1737 | } | ||
1738 | } | ||
1739 | return size; | ||
1740 | } | ||
1741 | |||
1742 | static ssize_t set_cmci_disabled(struct sys_device *s, | ||
1743 | struct sysdev_attribute *attr, | ||
1744 | const char *buf, size_t size) | ||
1745 | { | ||
1746 | u64 new; | ||
1747 | |||
1748 | if (strict_strtoull(buf, 0, &new) < 0) | ||
1749 | return -EINVAL; | ||
1750 | |||
1751 | if (mce_cmci_disabled ^ !!new) { | ||
1752 | if (new) { | ||
1753 | /* disable cmci */ | ||
1754 | on_each_cpu(mce_disable_ce, NULL, 1); | ||
1755 | mce_cmci_disabled = 1; | ||
1756 | } else { | ||
1757 | /* enable cmci */ | ||
1758 | mce_cmci_disabled = 0; | ||
1759 | on_each_cpu(mce_enable_ce, NULL, 1); | ||
1760 | } | ||
1761 | } | ||
1762 | return size; | ||
1682 | } | 1763 | } |
1683 | 1764 | ||
1684 | static ssize_t store_int_with_restart(struct sys_device *s, | 1765 | static ssize_t store_int_with_restart(struct sys_device *s, |
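The set_trigger() rewrite also fixes a latent oops: the old code did p = strchr(trigger, '\n'); if (*p) *p = 0;, dereferencing p even when strchr() finds no newline and returns NULL. The corrected newline-strip, reusable for any sysfs string store (buffer and function names hypothetical):

    #include <linux/string.h>
    #include <linux/types.h>

    static char helper_path[128];

    static ssize_t store_helper(const char *buf, size_t siz)
    {
            char *p;

            strncpy(helper_path, buf, sizeof(helper_path));
            helper_path[sizeof(helper_path) - 1] = 0;

            p = strchr(helper_path, '\n');
            if (p)                          /* test the pointer, not *p */
                    *p = 0;

            return strlen(helper_path) + !!p;   /* count the stripped '\n' */
    }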
@@ -1693,6 +1774,7 @@ static ssize_t store_int_with_restart(struct sys_device *s, | |||
1693 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | 1774 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); |
1694 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); | 1775 | static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); |
1695 | static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); | 1776 | static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); |
1777 | static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); | ||
1696 | 1778 | ||
1697 | static struct sysdev_ext_attribute attr_check_interval = { | 1779 | static struct sysdev_ext_attribute attr_check_interval = { |
1698 | _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, | 1780 | _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, |
@@ -1700,9 +1782,24 @@ static struct sysdev_ext_attribute attr_check_interval = { | |||
1700 | &check_interval | 1782 | &check_interval |
1701 | }; | 1783 | }; |
1702 | 1784 | ||
1785 | static struct sysdev_ext_attribute attr_ignore_ce = { | ||
1786 | _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), | ||
1787 | &mce_ignore_ce | ||
1788 | }; | ||
1789 | |||
1790 | static struct sysdev_ext_attribute attr_cmci_disabled = { | ||
1791 | _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), | ||
1792 | &mce_cmci_disabled | ||
1793 | }; | ||
1794 | |||
1703 | static struct sysdev_attribute *mce_attrs[] = { | 1795 | static struct sysdev_attribute *mce_attrs[] = { |
1704 | &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, | 1796 | &attr_tolerant.attr, |
1797 | &attr_check_interval.attr, | ||
1798 | &attr_trigger, | ||
1705 | &attr_monarch_timeout.attr, | 1799 | &attr_monarch_timeout.attr, |
1800 | &attr_dont_log_ce.attr, | ||
1801 | &attr_ignore_ce.attr, | ||
1802 | &attr_cmci_disabled.attr, | ||
1706 | NULL | 1803 | NULL |
1707 | }; | 1804 | }; |
1708 | 1805 | ||
@@ -1712,7 +1809,7 @@ static cpumask_var_t mce_dev_initialized; | |||
1712 | static __cpuinit int mce_create_device(unsigned int cpu) | 1809 | static __cpuinit int mce_create_device(unsigned int cpu) |
1713 | { | 1810 | { |
1714 | int err; | 1811 | int err; |
1715 | int i; | 1812 | int i, j; |
1716 | 1813 | ||
1717 | if (!mce_available(&boot_cpu_data)) | 1814 | if (!mce_available(&boot_cpu_data)) |
1718 | return -EIO; | 1815 | return -EIO; |
@@ -1730,9 +1827,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
1730 | if (err) | 1827 | if (err) |
1731 | goto error; | 1828 | goto error; |
1732 | } | 1829 | } |
1733 | for (i = 0; i < banks; i++) { | 1830 | for (j = 0; j < banks; j++) { |
1734 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), | 1831 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), |
1735 | &bank_attrs[i]); | 1832 | &bank_attrs[j]); |
1736 | if (err) | 1833 | if (err) |
1737 | goto error2; | 1834 | goto error2; |
1738 | } | 1835 | } |
@@ -1740,8 +1837,8 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
1740 | 1837 | ||
1741 | return 0; | 1838 | return 0; |
1742 | error2: | 1839 | error2: |
1743 | while (--i >= 0) | 1840 | while (--j >= 0) |
1744 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); | 1841 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); |
1745 | error: | 1842 | error: |
1746 | while (--i >= 0) | 1843 | while (--i >= 0) |
1747 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | 1844 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); |
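The i/j split repairs the error unwind: with a single index, a failure in the bank loop drove i down to -1 while removing bank files, so the subsequent error: loop removed none of the attribute files created earlier. One counter per resource class keeps each label undoing exactly what its own loop created; create_*/remove_* below are hypothetical stand-ins for the sysdev calls:

    /* Hypothetical per-file helpers standing in for sysdev_{create,remove}_file. */
    int create_attr(int i);  int create_bank(int j);
    void remove_attr(int i); void remove_bank(int j);

    static int create_files(int nattrs, int nbanks)
    {
            int i, j, err;

            for (i = 0; i < nattrs; i++)
                    if ((err = create_attr(i)))
                            goto undo_attrs;
            for (j = 0; j < nbanks; j++)
                    if ((err = create_bank(j)))
                            goto undo_banks;
            return 0;

    undo_banks:
            while (--j >= 0)
                    remove_bank(j);     /* j unwinds only bank files */
    undo_attrs:
            while (--i >= 0)
                    remove_attr(i);     /* i still counts all created attrs */
            return err;
    }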
@@ -1883,7 +1980,7 @@ static __init int mce_init_device(void) | |||
1883 | if (!mce_available(&boot_cpu_data)) | 1980 | if (!mce_available(&boot_cpu_data)) |
1884 | return -EIO; | 1981 | return -EIO; |
1885 | 1982 | ||
1886 | alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); | 1983 | zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); |
1887 | 1984 | ||
1888 | err = mce_init_banks(); | 1985 | err = mce_init_banks(); |
1889 | if (err) | 1986 | if (err) |
@@ -1915,7 +2012,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ | |||
1915 | /* This has to be run for each processor */ | 2012 | /* This has to be run for each processor */ |
1916 | void mcheck_init(struct cpuinfo_x86 *c) | 2013 | void mcheck_init(struct cpuinfo_x86 *c) |
1917 | { | 2014 | { |
1918 | if (mce_disabled == 1) | 2015 | if (mce_disabled) |
1919 | return; | 2016 | return; |
1920 | 2017 | ||
1921 | switch (c->x86_vendor) { | 2018 | switch (c->x86_vendor) { |
@@ -1945,10 +2042,9 @@ void mcheck_init(struct cpuinfo_x86 *c) | |||
1945 | 2042 | ||
1946 | static int __init mcheck_enable(char *str) | 2043 | static int __init mcheck_enable(char *str) |
1947 | { | 2044 | { |
1948 | mce_disabled = -1; | 2045 | mce_p5_enabled = 1; |
1949 | return 1; | 2046 | return 1; |
1950 | } | 2047 | } |
1951 | |||
1952 | __setup("mce", mcheck_enable); | 2048 | __setup("mce", mcheck_enable); |
1953 | 2049 | ||
1954 | #endif /* CONFIG_X86_OLD_MCE */ | 2050 | #endif /* CONFIG_X86_OLD_MCE */ |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h deleted file mode 100644 index 84a552b458c8..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ /dev/null | |||
@@ -1,38 +0,0 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <asm/mce.h> | ||
3 | |||
4 | #ifdef CONFIG_X86_OLD_MCE | ||
5 | void amd_mcheck_init(struct cpuinfo_x86 *c); | ||
6 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c); | ||
7 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c); | ||
8 | #endif | ||
9 | |||
10 | #ifdef CONFIG_X86_ANCIENT_MCE | ||
11 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); | ||
12 | void winchip_mcheck_init(struct cpuinfo_x86 *c); | ||
13 | extern int mce_p5_enable; | ||
14 | static inline int mce_p5_enabled(void) { return mce_p5_enable; } | ||
15 | static inline void enable_p5_mce(void) { mce_p5_enable = 1; } | ||
16 | #else | ||
17 | static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} | ||
18 | static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} | ||
19 | static inline int mce_p5_enabled(void) { return 0; } | ||
20 | static inline void enable_p5_mce(void) { } | ||
21 | #endif | ||
22 | |||
23 | /* Call the installed machine check handler for this CPU setup. */ | ||
24 | extern void (*machine_check_vector)(struct pt_regs *, long error_code); | ||
25 | |||
26 | #ifdef CONFIG_X86_OLD_MCE | ||
27 | |||
28 | extern int nr_mce_banks; | ||
29 | |||
30 | void intel_set_thermal_handler(void); | ||
31 | |||
32 | #else | ||
33 | |||
34 | static inline void intel_set_thermal_handler(void) { } | ||
35 | |||
36 | #endif | ||
37 | |||
38 | void intel_init_thermal(struct cpuinfo_x86 *c); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bda..ddae21620bda 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 2b011d2d8579..e1acec0f7a32 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -1,74 +1,226 @@ | |||
1 | /* | 1 | /* |
2 | * Common code for Intel machine checks | 2 | * Intel specific MCE features. |
3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | ||
4 | * Copyright (C) 2008, 2009 Intel Corporation | ||
5 | * Author: Andi Kleen | ||
3 | */ | 6 | */ |
4 | #include <linux/interrupt.h> | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/smp.h> | ||
9 | 7 | ||
10 | #include <asm/therm_throt.h> | 8 | #include <linux/init.h> |
11 | #include <asm/processor.h> | 9 | #include <linux/interrupt.h> |
12 | #include <asm/system.h> | 10 | #include <linux/percpu.h> |
13 | #include <asm/apic.h> | 11 | #include <asm/apic.h> |
12 | #include <asm/processor.h> | ||
14 | #include <asm/msr.h> | 13 | #include <asm/msr.h> |
14 | #include <asm/mce.h> | ||
15 | |||
16 | /* | ||
17 | * Support for Intel Corrected Machine Check Interrupts. This allows | ||
18 | * the CPU to raise an interrupt when a corrected machine check happened. | ||
19 | * Normally we pick those up using a regular polling timer. | ||
20 | * Also supports reliable discovery of shared banks. | ||
21 | */ | ||
15 | 22 | ||
16 | #include "mce.h" | 23 | static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); |
17 | 24 | ||
18 | void intel_init_thermal(struct cpuinfo_x86 *c) | 25 | /* |
26 | * cmci_discover_lock protects against parallel discovery attempts | ||
27 | * which could race against each other. | ||
28 | */ | ||
29 | static DEFINE_SPINLOCK(cmci_discover_lock); | ||
30 | |||
31 | #define CMCI_THRESHOLD 1 | ||
32 | |||
33 | static int cmci_supported(int *banks) | ||
19 | { | 34 | { |
20 | unsigned int cpu = smp_processor_id(); | 35 | u64 cap; |
21 | int tm2 = 0; | ||
22 | u32 l, h; | ||
23 | 36 | ||
24 | /* Thermal monitoring depends on ACPI and clock modulation*/ | 37 | if (mce_cmci_disabled || mce_ignore_ce) |
25 | if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) | 38 | return 0; |
26 | return; | ||
27 | 39 | ||
28 | /* | 40 | /* |
29 | * First check if its enabled already, in which case there might | 41 | * Vendor check is not strictly needed, but the initial |
30 | * be some SMM goo which handles it, so we can't even put a handler | 42 | * initialization is vendor keyed and this |
31 | * since it might be delivered via SMI already: | 43 | * makes sure none of the backdoors are entered otherwise. |
32 | */ | 44 | */ |
33 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | 45 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) |
34 | h = apic_read(APIC_LVTTHMR); | 46 | return 0; |
35 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | 47 | if (!cpu_has_apic || lapic_get_maxlvt() < 6) |
36 | printk(KERN_DEBUG | 48 | return 0; |
37 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | 49 | rdmsrl(MSR_IA32_MCG_CAP, cap); |
38 | return; | 50 | *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); |
51 | return !!(cap & MCG_CMCI_P); | ||
52 | } | ||
53 | |||
54 | /* | ||
55 | * The interrupt handler. This is called on every event. | ||
56 | * Just call the poller directly to log any events. | ||
57 | * This could in theory increase the threshold under high load, | ||
58 | * but doesn't for now. | ||
59 | */ | ||
60 | static void intel_threshold_interrupt(void) | ||
61 | { | ||
62 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
63 | mce_notify_irq(); | ||
64 | } | ||
65 | |||
66 | static void print_update(char *type, int *hdr, int num) | ||
67 | { | ||
68 | if (*hdr == 0) | ||
69 | printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); | ||
70 | *hdr = 1; | ||
71 | printk(KERN_CONT " %s:%d", type, num); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks | ||
76 | * on this CPU. Use the algorithm recommended in the SDM to discover shared | ||
77 | * banks. | ||
78 | */ | ||
79 | static void cmci_discover(int banks, int boot) | ||
80 | { | ||
81 | unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); | ||
82 | unsigned long flags; | ||
83 | int hdr = 0; | ||
84 | int i; | ||
85 | |||
86 | spin_lock_irqsave(&cmci_discover_lock, flags); | ||
87 | for (i = 0; i < banks; i++) { | ||
88 | u64 val; | ||
89 | |||
90 | if (test_bit(i, owned)) | ||
91 | continue; | ||
92 | |||
93 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
94 | |||
95 | /* Already owned by someone else? */ | ||
96 | if (val & CMCI_EN) { | ||
97 | if (test_and_clear_bit(i, owned) || boot) | ||
98 | print_update("SHD", &hdr, i); | ||
99 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
100 | continue; | ||
101 | } | ||
102 | |||
103 | val |= CMCI_EN | CMCI_THRESHOLD; | ||
104 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
105 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
106 | |||
107 | /* Did the enable bit stick? -- the bank supports CMCI */ | ||
108 | if (val & CMCI_EN) { | ||
109 | if (!test_and_set_bit(i, owned) || boot) | ||
110 | print_update("CMCI", &hdr, i); | ||
111 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
112 | } else { | ||
113 | WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); | ||
114 | } | ||
39 | } | 115 | } |
116 | spin_unlock_irqrestore(&cmci_discover_lock, flags); | ||
117 | if (hdr) | ||
118 | printk(KERN_CONT "\n"); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Just in case we missed an event during initialization check | ||
123 | * all the CMCI owned banks. | ||
124 | */ | ||
125 | void cmci_recheck(void) | ||
126 | { | ||
127 | unsigned long flags; | ||
128 | int banks; | ||
129 | |||
130 | if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) | ||
131 | return; | ||
132 | local_irq_save(flags); | ||
133 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
134 | local_irq_restore(flags); | ||
135 | } | ||
40 | 136 | ||
41 | if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) | 137 | /* |
42 | tm2 = 1; | 138 | * Disable CMCI on this CPU for all banks it owns when it goes down. |
139 | * This allows other CPUs to claim the banks on rediscovery. | ||
140 | */ | ||
141 | void cmci_clear(void) | ||
142 | { | ||
143 | unsigned long flags; | ||
144 | int i; | ||
145 | int banks; | ||
146 | u64 val; | ||
43 | 147 | ||
44 | /* Check whether a vector already exists */ | 148 | if (!cmci_supported(&banks)) |
45 | if (h & APIC_VECTOR_MASK) { | ||
46 | printk(KERN_DEBUG | ||
47 | "CPU%d: Thermal LVT vector (%#x) already installed\n", | ||
48 | cpu, (h & APIC_VECTOR_MASK)); | ||
49 | return; | 149 | return; |
150 | spin_lock_irqsave(&cmci_discover_lock, flags); | ||
151 | for (i = 0; i < banks; i++) { | ||
152 | if (!test_bit(i, __get_cpu_var(mce_banks_owned))) | ||
153 | continue; | ||
154 | /* Disable CMCI */ | ||
155 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
156 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | ||
157 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
158 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | ||
50 | } | 159 | } |
160 | spin_unlock_irqrestore(&cmci_discover_lock, flags); | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * After a CPU went down, cycle through all the others and rediscover. | ||
165 | * Must run in process context. | ||
166 | */ | ||
167 | void cmci_rediscover(int dying) | ||
168 | { | ||
169 | int banks; | ||
170 | int cpu; | ||
171 | cpumask_var_t old; | ||
172 | |||
173 | if (!cmci_supported(&banks)) | ||
174 | return; | ||
175 | if (!alloc_cpumask_var(&old, GFP_KERNEL)) | ||
176 | return; | ||
177 | cpumask_copy(old, ¤t->cpus_allowed); | ||
51 | 178 | ||
52 | /* We'll mask the thermal vector in the lapic till we're ready: */ | 179 | for_each_online_cpu(cpu) { |
53 | h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; | 180 | if (cpu == dying) |
54 | apic_write(APIC_LVTTHMR, h); | 181 | continue; |
182 | if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) | ||
183 | continue; | ||
184 | /* Recheck banks in case CPUs don't all have the same */ | ||
185 | if (cmci_supported(&banks)) | ||
186 | cmci_discover(banks, 0); | ||
187 | } | ||
55 | 188 | ||
56 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | 189 | set_cpus_allowed_ptr(current, old); |
57 | wrmsr(MSR_IA32_THERM_INTERRUPT, | 190 | free_cpumask_var(old); |
58 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | 191 | } |
59 | 192 | ||
60 | intel_set_thermal_handler(); | 193 | /* |
194 | * Reenable CMCI on this CPU in case a CPU down failed. | ||
195 | */ | ||
196 | void cmci_reenable(void) | ||
197 | { | ||
198 | int banks; | ||
199 | if (cmci_supported(&banks)) | ||
200 | cmci_discover(banks, 0); | ||
201 | } | ||
61 | 202 | ||
62 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | 203 | static void intel_init_cmci(void) |
63 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | 204 | { |
205 | int banks; | ||
64 | 206 | ||
65 | /* Unmask the thermal vector: */ | 207 | if (!cmci_supported(&banks)) |
66 | l = apic_read(APIC_LVTTHMR); | 208 | return; |
67 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
68 | 209 | ||
69 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | 210 | mce_threshold_vector = intel_threshold_interrupt; |
70 | cpu, tm2 ? "TM2" : "TM1"); | 211 | cmci_discover(banks, 1); |
212 | /* | ||
213 | * For CPU #0 this runs with still disabled APIC, but that's | ||
214 | * ok because only the vector is set up. We still do another | ||
215 | * check for the banks later for CPU #0 just to make sure | ||
216 | * to not miss any events. | ||
217 | */ | ||
218 | apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); | ||
219 | cmci_recheck(); | ||
220 | } | ||
71 | 221 | ||
72 | /* enable thermal throttle processing */ | 222 | void mce_intel_feature_init(struct cpuinfo_x86 *c) |
73 | atomic_set(&therm_throt_en, 1); | 223 | { |
224 | intel_init_thermal(c); | ||
225 | intel_init_cmci(); | ||
74 | } | 226 | } |
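The heart of the relocated cmci_discover() is the ownership probe: read the bank's CTL2 MSR, and if CMCI_EN is already set, another CPU owns this shared bank; otherwise write CMCI_EN and read it back. Whether the bit "sticks" tells CMCI-capable banks apart from polling-only ones. The probe in isolation (locking and the poll-mask bookkeeping stripped, so this is a sketch rather than the full discovery loop):

    #include <asm/msr.h>

    /* 1: bank claimed for CMCI; 0: shared with another CPU or polling-only */
    static int claim_bank(int i)
    {
            u64 val;

            rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
            if (val & CMCI_EN)
                    return 0;                   /* another CPU owns it */

            val |= CMCI_EN | CMCI_THRESHOLD;
            wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
            rdmsrl(MSR_IA32_MC0_CTL2 + i, val);

            return !!(val & CMCI_EN);           /* did the enable bit stick? */
    }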
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c deleted file mode 100644 index f2ef6952c400..000000000000 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ /dev/null | |||
@@ -1,248 +0,0 @@ | |||
1 | /* | ||
2 | * Intel specific MCE features. | ||
3 | * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> | ||
4 | * Copyright (C) 2008, 2009 Intel Corporation | ||
5 | * Author: Andi Kleen | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/interrupt.h> | ||
10 | #include <linux/percpu.h> | ||
11 | #include <asm/processor.h> | ||
12 | #include <asm/apic.h> | ||
13 | #include <asm/msr.h> | ||
14 | #include <asm/mce.h> | ||
15 | #include <asm/hw_irq.h> | ||
16 | #include <asm/idle.h> | ||
17 | #include <asm/therm_throt.h> | ||
18 | |||
19 | #include "mce.h" | ||
20 | |||
21 | asmlinkage void smp_thermal_interrupt(void) | ||
22 | { | ||
23 | __u64 msr_val; | ||
24 | |||
25 | ack_APIC_irq(); | ||
26 | |||
27 | exit_idle(); | ||
28 | irq_enter(); | ||
29 | |||
30 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | ||
31 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) | ||
32 | mce_log_therm_throt_event(msr_val); | ||
33 | |||
34 | inc_irq_stat(irq_thermal_count); | ||
35 | irq_exit(); | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * Support for Intel Correct Machine Check Interrupts. This allows | ||
40 | * the CPU to raise an interrupt when a corrected machine check happened. | ||
41 | * Normally we pick those up using a regular polling timer. | ||
42 | * Also supports reliable discovery of shared banks. | ||
43 | */ | ||
44 | |||
45 | static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); | ||
46 | |||
47 | /* | ||
48 | * cmci_discover_lock protects against parallel discovery attempts | ||
49 | * which could race against each other. | ||
50 | */ | ||
51 | static DEFINE_SPINLOCK(cmci_discover_lock); | ||
52 | |||
53 | #define CMCI_THRESHOLD 1 | ||
54 | |||
55 | static int cmci_supported(int *banks) | ||
56 | { | ||
57 | u64 cap; | ||
58 | |||
59 | if (mce_cmci_disabled || mce_ignore_ce) | ||
60 | return 0; | ||
61 | |||
62 | /* | ||
63 | * Vendor check is not strictly needed, but the initial | ||
64 | * initialization is vendor keyed and this | ||
65 | * makes sure none of the backdoors are entered otherwise. | ||
66 | */ | ||
67 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) | ||
68 | return 0; | ||
69 | if (!cpu_has_apic || lapic_get_maxlvt() < 6) | ||
70 | return 0; | ||
71 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
72 | *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); | ||
73 | return !!(cap & MCG_CMCI_P); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * The interrupt handler. This is called on every event. | ||
78 | * Just call the poller directly to log any events. | ||
79 | * This could in theory increase the threshold under high load, | ||
80 | * but doesn't for now. | ||
81 | */ | ||
82 | static void intel_threshold_interrupt(void) | ||
83 | { | ||
84 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
85 | mce_notify_irq(); | ||
86 | } | ||
87 | |||
88 | static void print_update(char *type, int *hdr, int num) | ||
89 | { | ||
90 | if (*hdr == 0) | ||
91 | printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); | ||
92 | *hdr = 1; | ||
93 | printk(KERN_CONT " %s:%d", type, num); | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks | ||
98 | * on this CPU. Use the algorithm recommended in the SDM to discover shared | ||
99 | * banks. | ||
100 | */ | ||
101 | static void cmci_discover(int banks, int boot) | ||
102 | { | ||
103 | unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); | ||
104 | unsigned long flags; | ||
105 | int hdr = 0; | ||
106 | int i; | ||
107 | |||
108 | spin_lock_irqsave(&cmci_discover_lock, flags); | ||
109 | for (i = 0; i < banks; i++) { | ||
110 | u64 val; | ||
111 | |||
112 | if (test_bit(i, owned)) | ||
113 | continue; | ||
114 | |||
115 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
116 | |||
117 | /* Already owned by someone else? */ | ||
118 | if (val & CMCI_EN) { | ||
119 | if (test_and_clear_bit(i, owned) || boot) | ||
120 | print_update("SHD", &hdr, i); | ||
121 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
122 | continue; | ||
123 | } | ||
124 | |||
125 | val |= CMCI_EN | CMCI_THRESHOLD; | ||
126 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
127 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
128 | |||
129 | /* Did the enable bit stick? -- the bank supports CMCI */ | ||
130 | if (val & CMCI_EN) { | ||
131 | if (!test_and_set_bit(i, owned) || boot) | ||
132 | print_update("CMCI", &hdr, i); | ||
133 | __clear_bit(i, __get_cpu_var(mce_poll_banks)); | ||
134 | } else { | ||
135 | WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); | ||
136 | } | ||
137 | } | ||
138 | spin_unlock_irqrestore(&cmci_discover_lock, flags); | ||
139 | if (hdr) | ||
140 | printk(KERN_CONT "\n"); | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * Just in case we missed an event during initialization check | ||
145 | * all the CMCI owned banks. | ||
146 | */ | ||
147 | void cmci_recheck(void) | ||
148 | { | ||
149 | unsigned long flags; | ||
150 | int banks; | ||
151 | |||
152 | if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) | ||
153 | return; | ||
154 | local_irq_save(flags); | ||
155 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | ||
156 | local_irq_restore(flags); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Disable CMCI on this CPU for all banks it owns when it goes down. | ||
161 | * This allows other CPUs to claim the banks on rediscovery. | ||
162 | */ | ||
163 | void cmci_clear(void) | ||
164 | { | ||
165 | unsigned long flags; | ||
166 | int i; | ||
167 | int banks; | ||
168 | u64 val; | ||
169 | |||
170 | if (!cmci_supported(&banks)) | ||
171 | return; | ||
172 | spin_lock_irqsave(&cmci_discover_lock, flags); | ||
173 | for (i = 0; i < banks; i++) { | ||
174 | if (!test_bit(i, __get_cpu_var(mce_banks_owned))) | ||
175 | continue; | ||
176 | /* Disable CMCI */ | ||
177 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
178 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | ||
179 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | ||
180 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | ||
181 | } | ||
182 | spin_unlock_irqrestore(&cmci_discover_lock, flags); | ||
183 | } | ||
184 | |||
185 | /* | ||
186 | * After a CPU went down cycle through all the others and rediscover | ||
187 | * Must run in process context. | ||
188 | */ | ||
189 | void cmci_rediscover(int dying) | ||
190 | { | ||
191 | int banks; | ||
192 | int cpu; | ||
193 | cpumask_var_t old; | ||
194 | |||
195 | if (!cmci_supported(&banks)) | ||
196 | return; | ||
197 | if (!alloc_cpumask_var(&old, GFP_KERNEL)) | ||
198 | return; | ||
199 | cpumask_copy(old, ¤t->cpus_allowed); | ||
200 | |||
201 | for_each_online_cpu(cpu) { | ||
202 | if (cpu == dying) | ||
203 | continue; | ||
204 | if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) | ||
205 | continue; | ||
206 | /* Recheck banks in case CPUs don't all have the same */ | ||
207 | if (cmci_supported(&banks)) | ||
208 | cmci_discover(banks, 0); | ||
209 | } | ||
210 | |||
211 | set_cpus_allowed_ptr(current, old); | ||
212 | free_cpumask_var(old); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Reenable CMCI on this CPU in case a CPU down failed. | ||
217 | */ | ||
218 | void cmci_reenable(void) | ||
219 | { | ||
220 | int banks; | ||
221 | if (cmci_supported(&banks)) | ||
222 | cmci_discover(banks, 0); | ||
223 | } | ||
224 | |||
225 | static void intel_init_cmci(void) | ||
226 | { | ||
227 | int banks; | ||
228 | |||
229 | if (!cmci_supported(&banks)) | ||
230 | return; | ||
231 | |||
232 | mce_threshold_vector = intel_threshold_interrupt; | ||
233 | cmci_discover(banks, 1); | ||
234 | /* | ||
235 | * For CPU #0 this runs with still disabled APIC, but that's | ||
236 | * ok because only the vector is set up. We still do another | ||
237 | * check for the banks later for CPU #0 just to make sure | ||
238 | * to not miss any events. | ||
239 | */ | ||
240 | apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); | ||
241 | cmci_recheck(); | ||
242 | } | ||
243 | |||
244 | void mce_intel_feature_init(struct cpuinfo_x86 *c) | ||
245 | { | ||
246 | intel_init_thermal(c); | ||
247 | intel_init_cmci(); | ||
248 | } | ||
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 70b710420f74..f5f2d6f71fb6 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c | |||
@@ -17,10 +17,9 @@ | |||
17 | 17 | ||
18 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
19 | #include <asm/system.h> | 19 | #include <asm/system.h> |
20 | #include <asm/mce.h> | ||
20 | #include <asm/msr.h> | 21 | #include <asm/msr.h> |
21 | 22 | ||
22 | #include "mce.h" | ||
23 | |||
24 | static int firstbank; | 23 | static int firstbank; |
25 | 24 | ||
26 | #define MCE_RATE (15*HZ) /* timer rate is 15s */ | 25 | #define MCE_RATE (15*HZ) /* timer rate is 15s */ |
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 82cee108a2d3..4482aea9aa2e 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c | |||
@@ -1,21 +1,15 @@ | |||
1 | /* | 1 | /* |
2 | * P4 specific Machine Check Exception Reporting | 2 | * P4 specific Machine Check Exception Reporting |
3 | */ | 3 | */ |
4 | |||
5 | #include <linux/interrupt.h> | ||
6 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
7 | #include <linux/types.h> | 5 | #include <linux/types.h> |
8 | #include <linux/init.h> | 6 | #include <linux/init.h> |
9 | #include <linux/smp.h> | 7 | #include <linux/smp.h> |
10 | 8 | ||
11 | #include <asm/therm_throt.h> | ||
12 | #include <asm/processor.h> | 9 | #include <asm/processor.h> |
13 | #include <asm/system.h> | 10 | #include <asm/mce.h> |
14 | #include <asm/apic.h> | ||
15 | #include <asm/msr.h> | 11 | #include <asm/msr.h> |
16 | 12 | ||
17 | #include "mce.h" | ||
18 | |||
19 | /* as supported by the P4/Xeon family */ | 13 | /* as supported by the P4/Xeon family */ |
20 | struct intel_mce_extended_msrs { | 14 | struct intel_mce_extended_msrs { |
21 | u32 eax; | 15 | u32 eax; |
@@ -33,46 +27,6 @@ struct intel_mce_extended_msrs { | |||
33 | 27 | ||
34 | static int mce_num_extended_msrs; | 28 | static int mce_num_extended_msrs; |
35 | 29 | ||
36 | |||
37 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
38 | |||
39 | static void unexpected_thermal_interrupt(struct pt_regs *regs) | ||
40 | { | ||
41 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", | ||
42 | smp_processor_id()); | ||
43 | add_taint(TAINT_MACHINE_CHECK); | ||
44 | } | ||
45 | |||
46 | /* P4/Xeon Thermal transition interrupt handler: */ | ||
47 | static void intel_thermal_interrupt(struct pt_regs *regs) | ||
48 | { | ||
49 | __u64 msr_val; | ||
50 | |||
51 | ack_APIC_irq(); | ||
52 | |||
53 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | ||
54 | therm_throt_process(msr_val & THERM_STATUS_PROCHOT); | ||
55 | } | ||
56 | |||
57 | /* Thermal interrupt handler for this CPU setup: */ | ||
58 | static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = | ||
59 | unexpected_thermal_interrupt; | ||
60 | |||
61 | void smp_thermal_interrupt(struct pt_regs *regs) | ||
62 | { | ||
63 | irq_enter(); | ||
64 | vendor_thermal_interrupt(regs); | ||
65 | __get_cpu_var(irq_stat).irq_thermal_count++; | ||
66 | irq_exit(); | ||
67 | } | ||
68 | |||
69 | void intel_set_thermal_handler(void) | ||
70 | { | ||
71 | vendor_thermal_interrupt = intel_thermal_interrupt; | ||
72 | } | ||
73 | |||
74 | #endif /* CONFIG_X86_MCE_P4THERMAL */ | ||
75 | |||
76 | /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ | 30 | /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ |
77 | static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) | 31 | static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) |
78 | { | 32 | { |
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 015f481ab1b0..5c0e6533d9bc 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c | |||
@@ -10,12 +10,11 @@ | |||
10 | 10 | ||
11 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 12 | #include <asm/system.h> |
13 | #include <asm/mce.h> | ||
13 | #include <asm/msr.h> | 14 | #include <asm/msr.h> |
14 | 15 | ||
15 | #include "mce.h" | ||
16 | |||
17 | /* By default disabled */ | 16 | /* By default disabled */ |
18 | int mce_p5_enable; | 17 | int mce_p5_enabled __read_mostly; |
19 | 18 | ||
20 | /* Machine check handler for Pentium class Intel CPUs: */ | 19 | /* Machine check handler for Pentium class Intel CPUs: */ |
21 | static void pentium_machine_check(struct pt_regs *regs, long error_code) | 20 | static void pentium_machine_check(struct pt_regs *regs, long error_code) |
@@ -43,15 +42,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) | |||
43 | { | 42 | { |
44 | u32 l, h; | 43 | u32 l, h; |
45 | 44 | ||
46 | /* Check for MCE support: */ | 45 | /* Default P5 to off as it's often misconnected: */ |
47 | if (!cpu_has(c, X86_FEATURE_MCE)) | 46 | if (!mce_p5_enabled) |
48 | return; | 47 | return; |
49 | 48 | ||
50 | #ifdef CONFIG_X86_OLD_MCE | 49 | /* Check for MCE support: */ |
51 | /* Default P5 to off as it's often misconnected: */ | 50 | if (!cpu_has(c, X86_FEATURE_MCE))
52 | if (mce_disabled != -1) | ||
53 | return; | 51 | return; |
54 | #endif | ||
55 | 52 | ||
56 | machine_check_vector = pentium_machine_check; | 53 | machine_check_vector = pentium_machine_check; |
57 | /* Make sure the vector pointer is visible before we enable MCEs: */ | 54 | /* Make sure the vector pointer is visible before we enable MCEs: */ |
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index 43c24e667457..01e4f8178183 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c | |||
@@ -10,10 +10,9 @@ | |||
10 | 10 | ||
11 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
12 | #include <asm/system.h> | 12 | #include <asm/system.h> |
13 | #include <asm/mce.h> | ||
13 | #include <asm/msr.h> | 14 | #include <asm/msr.h> |
14 | 15 | ||
15 | #include "mce.h" | ||
16 | |||
17 | /* Machine Check Handler For PII/PIII */ | 16 | /* Machine Check Handler For PII/PIII */ |
18 | static void intel_machine_check(struct pt_regs *regs, long error_code) | 17 | static void intel_machine_check(struct pt_regs *regs, long error_code) |
19 | { | 18 | { |
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7b1ae2e20ba5..5957a93e5173 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -13,21 +13,32 @@ | |||
13 | * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. | 13 | * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. |
14 | * Inspired by Ross Biro's and Al Borchers' counter code. | 14 | * Inspired by Ross Biro's and Al Borchers' counter code. |
15 | */ | 15 | */ |
16 | #include <linux/interrupt.h> | ||
16 | #include <linux/notifier.h> | 17 | #include <linux/notifier.h> |
17 | #include <linux/jiffies.h> | 18 | #include <linux/jiffies.h> |
19 | #include <linux/kernel.h> | ||
18 | #include <linux/percpu.h> | 20 | #include <linux/percpu.h> |
19 | #include <linux/sysdev.h> | 21 | #include <linux/sysdev.h> |
22 | #include <linux/types.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/smp.h> | ||
20 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
21 | 26 | ||
22 | #include <asm/therm_throt.h> | 27 | #include <asm/processor.h> |
28 | #include <asm/system.h> | ||
29 | #include <asm/apic.h> | ||
30 | #include <asm/idle.h> | ||
31 | #include <asm/mce.h> | ||
32 | #include <asm/msr.h> | ||
23 | 33 | ||
24 | /* How long to wait between reporting thermal events */ | 34 | /* How long to wait between reporting thermal events */ |
25 | #define CHECK_INTERVAL (300 * HZ) | 35 | #define CHECK_INTERVAL (300 * HZ) |
26 | 36 | ||
27 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; | 37 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; |
28 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); | 38 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); |
39 | static DEFINE_PER_CPU(bool, thermal_throttle_active); | ||
29 | 40 | ||
30 | atomic_t therm_throt_en = ATOMIC_INIT(0); | 41 | static atomic_t therm_throt_en = ATOMIC_INIT(0); |
31 | 42 | ||
32 | #ifdef CONFIG_SYSFS | 43 | #ifdef CONFIG_SYSFS |
33 | #define define_therm_throt_sysdev_one_ro(_name) \ | 44 | #define define_therm_throt_sysdev_one_ro(_name) \ |
@@ -82,31 +93,37 @@ static struct attribute_group thermal_throttle_attr_group = { | |||
82 | * 1 : Event should be logged further, and a message has been | 93 | * 1 : Event should be logged further, and a message has been |
83 | * printed to the syslog. | 94 | * printed to the syslog. |
84 | */ | 95 | */ |
85 | int therm_throt_process(int curr) | 96 | static int therm_throt_process(int curr) |
86 | { | 97 | { |
87 | unsigned int cpu = smp_processor_id(); | 98 | unsigned int cpu = smp_processor_id(); |
88 | __u64 tmp_jiffs = get_jiffies_64(); | 99 | __u64 tmp_jiffs = get_jiffies_64(); |
100 | bool was_throttled = __get_cpu_var(thermal_throttle_active); | ||
101 | bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; | ||
89 | 102 | ||
90 | if (curr) | 103 | if (is_throttled) |
91 | __get_cpu_var(thermal_throttle_count)++; | 104 | __get_cpu_var(thermal_throttle_count)++; |
92 | 105 | ||
93 | if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) | 106 | if (!(was_throttled ^ is_throttled) && |
107 | time_before64(tmp_jiffs, __get_cpu_var(next_check))) | ||
94 | return 0; | 108 | return 0; |
95 | 109 | ||
96 | __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; | 110 | __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; |
97 | 111 | ||
98 | /* if we just entered the thermal event */ | 112 | /* if we just entered the thermal event */ |
99 | if (curr) { | 113 | if (is_throttled) { |
100 | printk(KERN_CRIT "CPU%d: Temperature above threshold, " | 114 | printk(KERN_CRIT "CPU%d: Temperature above threshold, " |
101 | "cpu clock throttled (total events = %lu)\n", cpu, | 115 | "cpu clock throttled (total events = %lu)\n", |
102 | __get_cpu_var(thermal_throttle_count)); | 116 | cpu, __get_cpu_var(thermal_throttle_count)); |
103 | 117 | ||
104 | add_taint(TAINT_MACHINE_CHECK); | 118 | add_taint(TAINT_MACHINE_CHECK); |
105 | } else { | 119 | return 1; |
106 | printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); | 120 | } |
121 | if (was_throttled) { | ||
122 | printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); | ||
123 | return 1; | ||
107 | } | 124 | } |
108 | 125 | ||
109 | return 1; | 126 | return 0; |
110 | } | 127 | } |
111 | 128 | ||
112 | #ifdef CONFIG_SYSFS | 129 | #ifdef CONFIG_SYSFS |
@@ -186,6 +203,94 @@ static __init int thermal_throttle_init_device(void) | |||
186 | 203 | ||
187 | return 0; | 204 | return 0; |
188 | } | 205 | } |
189 | |||
190 | device_initcall(thermal_throttle_init_device); | 206 | device_initcall(thermal_throttle_init_device); |
207 | |||
191 | #endif /* CONFIG_SYSFS */ | 208 | #endif /* CONFIG_SYSFS */ |
209 | |||
210 | /* Thermal transition interrupt handler */ | ||
211 | static void intel_thermal_interrupt(void) | ||
212 | { | ||
213 | __u64 msr_val; | ||
214 | |||
215 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | ||
216 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) | ||
217 | mce_log_therm_throt_event(msr_val); | ||
218 | } | ||
219 | |||
220 | static void unexpected_thermal_interrupt(void) | ||
221 | { | ||
222 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", | ||
223 | smp_processor_id()); | ||
224 | add_taint(TAINT_MACHINE_CHECK); | ||
225 | } | ||
226 | |||
227 | static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; | ||
228 | |||
229 | asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) | ||
230 | { | ||
231 | exit_idle(); | ||
232 | irq_enter(); | ||
233 | inc_irq_stat(irq_thermal_count); | ||
234 | smp_thermal_vector(); | ||
235 | irq_exit(); | ||
236 | /* Ack only at the end to avoid potential reentry */ | ||
237 | ack_APIC_irq(); | ||
238 | } | ||
239 | |||
240 | void intel_init_thermal(struct cpuinfo_x86 *c) | ||
241 | { | ||
242 | unsigned int cpu = smp_processor_id(); | ||
243 | int tm2 = 0; | ||
244 | u32 l, h; | ||
245 | |||
246 | /* Thermal monitoring depends on ACPI and clock modulation */ | ||
247 | if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) | ||
248 | return; | ||
249 | |||
250 | /* | ||
251 | * First check if it's enabled already, in which case there might | ||
252 | * be some SMM goo which handles it, so we can't even put a handler | ||
253 | * since it might be delivered via SMI already: | ||
254 | */ | ||
255 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
256 | h = apic_read(APIC_LVTTHMR); | ||
257 | if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { | ||
258 | printk(KERN_DEBUG | ||
259 | "CPU%d: Thermal monitoring handled by SMI\n", cpu); | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) | ||
264 | tm2 = 1; | ||
265 | |||
266 | /* Check whether a vector already exists */ | ||
267 | if (h & APIC_VECTOR_MASK) { | ||
268 | printk(KERN_DEBUG | ||
269 | "CPU%d: Thermal LVT vector (%#x) already installed\n", | ||
270 | cpu, (h & APIC_VECTOR_MASK)); | ||
271 | return; | ||
272 | } | ||
273 | |||
274 | /* We'll mask the thermal vector in the lapic till we're ready: */ | ||
275 | h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; | ||
276 | apic_write(APIC_LVTTHMR, h); | ||
277 | |||
278 | rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); | ||
279 | wrmsr(MSR_IA32_THERM_INTERRUPT, | ||
280 | l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); | ||
281 | |||
282 | smp_thermal_vector = intel_thermal_interrupt; | ||
283 | |||
284 | rdmsr(MSR_IA32_MISC_ENABLE, l, h); | ||
285 | wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); | ||
286 | |||
287 | /* Unmask the thermal vector: */ | ||
288 | l = apic_read(APIC_LVTTHMR); | ||
289 | apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); | ||
290 | |||
291 | printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", | ||
292 | cpu, tm2 ? "TM2" : "TM1"); | ||
293 | |||
294 | /* enable thermal throttle processing */ | ||
295 | atomic_set(&therm_throt_en, 1); | ||
296 | } | ||
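
The reworked therm_throt_process() above combines edge detection with rate limiting: a state change (throttled <-> normal) is always reported, while repeated reports of an unchanged state are limited to one per CHECK_INTERVAL. A minimal userspace model of that policy (single flag instead of the kernel's per-CPU variables; names are hypothetical):

    #include <stdbool.h>
    #include <stdint.h>

    #define CHECK_INTERVAL 300          /* seconds, mirroring 300 * HZ above */

    static bool     active;             /* models thermal_throttle_active */
    static uint64_t next_check;         /* models the per-CPU next_check  */

    static int should_log(bool curr, uint64_t now)
    {
            bool was = active;

            active = curr;

            /* Skip only when the state is unchanged AND the interval has
             * not yet expired -- a state change always gets through. */
            if (was == curr && now < next_check)
                    return 0;

            next_check = now + CHECK_INTERVAL;

            if (curr)
                    return 1;           /* (still) throttled: report       */
            if (was)
                    return 1;           /* just left throttling: report    */
            return 0;                   /* stayed normal: nothing to say   */
    }

This matches the kernel flow: the `was_throttled ^ is_throttled` test is the edge detector, and the KERN_INFO/KERN_CRIT split above distinguishes the two reportable cases.
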
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 81b02487090b..54060f565974 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c | |||
@@ -9,10 +9,9 @@ | |||
9 | 9 | ||
10 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
11 | #include <asm/system.h> | 11 | #include <asm/system.h> |
12 | #include <asm/mce.h> | ||
12 | #include <asm/msr.h> | 13 | #include <asm/msr.h> |
13 | 14 | ||
14 | #include "mce.h" | ||
15 | |||
16 | /* Machine check handler for WinChip C6: */ | 15 | /* Machine check handler for WinChip C6: */ |
17 | static void winchip_machine_check(struct pt_regs *regs, long error_code) | 16 | static void winchip_machine_check(struct pt_regs *regs, long error_code) |
18 | { | 17 | { |
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 275bc142cd5d..900332b800f8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/kdebug.h> | 19 | #include <linux/kdebug.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/highmem.h> | ||
22 | 23 | ||
23 | #include <asm/apic.h> | 24 | #include <asm/apic.h> |
24 | #include <asm/stacktrace.h> | 25 | #include <asm/stacktrace.h> |
@@ -54,6 +55,7 @@ struct x86_pmu { | |||
54 | int num_counters_fixed; | 55 | int num_counters_fixed; |
55 | int counter_bits; | 56 | int counter_bits; |
56 | u64 counter_mask; | 57 | u64 counter_mask; |
58 | int apic; | ||
57 | u64 max_period; | 59 | u64 max_period; |
58 | u64 intel_ctrl; | 60 | u64 intel_ctrl; |
59 | }; | 61 | }; |
@@ -65,6 +67,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { | |||
65 | }; | 67 | }; |
66 | 68 | ||
67 | /* | 69 | /* |
70 | * Not sure about some of these | ||
71 | */ | ||
72 | static const u64 p6_perfmon_event_map[] = | ||
73 | { | ||
74 | [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, | ||
75 | [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, | ||
76 | [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, | ||
77 | [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, | ||
78 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, | ||
79 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, | ||
80 | [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, | ||
81 | }; | ||
82 | |||
83 | static u64 p6_pmu_event_map(int event) | ||
84 | { | ||
85 | return p6_perfmon_event_map[event]; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Counter setting that is specified not to count anything. | ||
90 | * We use this to effectively disable a counter. | ||
91 | * | ||
92 | * L2_RQSTS with 0 MESI unit mask. | ||
93 | */ | ||
94 | #define P6_NOP_COUNTER 0x0000002EULL | ||
95 | |||
96 | static u64 p6_pmu_raw_event(u64 event) | ||
97 | { | ||
98 | #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL | ||
99 | #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL | ||
100 | #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL | ||
101 | #define P6_EVNTSEL_INV_MASK 0x00800000ULL | ||
102 | #define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL | ||
103 | |||
104 | #define P6_EVNTSEL_MASK \ | ||
105 | (P6_EVNTSEL_EVENT_MASK | \ | ||
106 | P6_EVNTSEL_UNIT_MASK | \ | ||
107 | P6_EVNTSEL_EDGE_MASK | \ | ||
108 | P6_EVNTSEL_INV_MASK | \ | ||
109 | P6_EVNTSEL_COUNTER_MASK) | ||
110 | |||
111 | return event & P6_EVNTSEL_MASK; | ||
112 | } | ||
113 | |||
114 | |||
115 | /* | ||
68 | * Intel PerfMon v3. Used on Core2 and later. | 116 | * Intel PerfMon v3. Used on Core2 and later. |
69 | */ | 117 | */ |
70 | static const u64 intel_perfmon_event_map[] = | 118 | static const u64 intel_perfmon_event_map[] = |
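
The p6_pmu_raw_event() hunk above sanitizes a user-supplied raw config by keeping only the event-select, unit-mask, edge, invert, and counter-mask fields; control bits such as the enable bit are stripped. A quick check of that behavior (the mask constant is just the five field masks above ORed together; the raw value is hypothetical):

    #include <assert.h>
    #include <stdint.h>

    #define P6_EVNTSEL_MASK 0xFF84FFFFULL  /* event|unit|edge|inv|cmask    */
    #define EVNTSEL_ENABLE  (1ULL << 22)   /* ARCH_PERFMON_EVENTSEL0_ENABLE */

    int main(void)
    {
            uint64_t raw = 0x004101c4ULL;  /* hypothetical raw config      */
            uint64_t cfg = raw & P6_EVNTSEL_MASK;

            assert((cfg & 0xff) == 0xc4);  /* event select is kept         */
            assert(!(cfg & EVNTSEL_ENABLE)); /* enable bit is stripped     */
            return 0;
    }
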
@@ -389,23 +437,23 @@ static u64 intel_pmu_raw_event(u64 event) | |||
389 | return event & CORE_EVNTSEL_MASK; | 437 | return event & CORE_EVNTSEL_MASK; |
390 | } | 438 | } |
391 | 439 | ||
392 | static const u64 amd_0f_hw_cache_event_ids | 440 | static const u64 amd_hw_cache_event_ids |
393 | [PERF_COUNT_HW_CACHE_MAX] | 441 | [PERF_COUNT_HW_CACHE_MAX] |
394 | [PERF_COUNT_HW_CACHE_OP_MAX] | 442 | [PERF_COUNT_HW_CACHE_OP_MAX] |
395 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = | 443 | [PERF_COUNT_HW_CACHE_RESULT_MAX] = |
396 | { | 444 | { |
397 | [ C(L1D) ] = { | 445 | [ C(L1D) ] = { |
398 | [ C(OP_READ) ] = { | 446 | [ C(OP_READ) ] = { |
399 | [ C(RESULT_ACCESS) ] = 0, | 447 | [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ |
400 | [ C(RESULT_MISS) ] = 0, | 448 | [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ |
401 | }, | 449 | }, |
402 | [ C(OP_WRITE) ] = { | 450 | [ C(OP_WRITE) ] = { |
403 | [ C(RESULT_ACCESS) ] = 0, | 451 | [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ |
404 | [ C(RESULT_MISS) ] = 0, | 452 | [ C(RESULT_MISS) ] = 0, |
405 | }, | 453 | }, |
406 | [ C(OP_PREFETCH) ] = { | 454 | [ C(OP_PREFETCH) ] = { |
407 | [ C(RESULT_ACCESS) ] = 0, | 455 | [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ |
408 | [ C(RESULT_MISS) ] = 0, | 456 | [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ |
409 | }, | 457 | }, |
410 | }, | 458 | }, |
411 | [ C(L1I ) ] = { | 459 | [ C(L1I ) ] = { |
@@ -418,17 +466,17 @@ static const u64 amd_0f_hw_cache_event_ids | |||
418 | [ C(RESULT_MISS) ] = -1, | 466 | [ C(RESULT_MISS) ] = -1, |
419 | }, | 467 | }, |
420 | [ C(OP_PREFETCH) ] = { | 468 | [ C(OP_PREFETCH) ] = { |
421 | [ C(RESULT_ACCESS) ] = 0, | 469 | [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ |
422 | [ C(RESULT_MISS) ] = 0, | 470 | [ C(RESULT_MISS) ] = 0, |
423 | }, | 471 | }, |
424 | }, | 472 | }, |
425 | [ C(LL ) ] = { | 473 | [ C(LL ) ] = { |
426 | [ C(OP_READ) ] = { | 474 | [ C(OP_READ) ] = { |
427 | [ C(RESULT_ACCESS) ] = 0, | 475 | [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ |
428 | [ C(RESULT_MISS) ] = 0, | 476 | [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ |
429 | }, | 477 | }, |
430 | [ C(OP_WRITE) ] = { | 478 | [ C(OP_WRITE) ] = { |
431 | [ C(RESULT_ACCESS) ] = 0, | 479 | [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ |
432 | [ C(RESULT_MISS) ] = 0, | 480 | [ C(RESULT_MISS) ] = 0, |
433 | }, | 481 | }, |
434 | [ C(OP_PREFETCH) ] = { | 482 | [ C(OP_PREFETCH) ] = { |
@@ -438,8 +486,8 @@ static const u64 amd_0f_hw_cache_event_ids | |||
438 | }, | 486 | }, |
439 | [ C(DTLB) ] = { | 487 | [ C(DTLB) ] = { |
440 | [ C(OP_READ) ] = { | 488 | [ C(OP_READ) ] = { |
441 | [ C(RESULT_ACCESS) ] = 0, | 489 | [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ |
442 | [ C(RESULT_MISS) ] = 0, | 490 | [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DTLB Miss */ |
443 | }, | 491 | }, |
444 | [ C(OP_WRITE) ] = { | 492 | [ C(OP_WRITE) ] = { |
445 | [ C(RESULT_ACCESS) ] = 0, | 493 | [ C(RESULT_ACCESS) ] = 0, |
@@ -566,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); | |||
566 | 614 | ||
567 | static bool reserve_pmc_hardware(void) | 615 | static bool reserve_pmc_hardware(void) |
568 | { | 616 | { |
617 | #ifdef CONFIG_X86_LOCAL_APIC | ||
569 | int i; | 618 | int i; |
570 | 619 | ||
571 | if (nmi_watchdog == NMI_LOCAL_APIC) | 620 | if (nmi_watchdog == NMI_LOCAL_APIC) |
@@ -580,9 +629,11 @@ static bool reserve_pmc_hardware(void) | |||
580 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) | 629 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) |
581 | goto eventsel_fail; | 630 | goto eventsel_fail; |
582 | } | 631 | } |
632 | #endif | ||
583 | 633 | ||
584 | return true; | 634 | return true; |
585 | 635 | ||
636 | #ifdef CONFIG_X86_LOCAL_APIC | ||
586 | eventsel_fail: | 637 | eventsel_fail: |
587 | for (i--; i >= 0; i--) | 638 | for (i--; i >= 0; i--) |
588 | release_evntsel_nmi(x86_pmu.eventsel + i); | 639 | release_evntsel_nmi(x86_pmu.eventsel + i); |
@@ -597,10 +648,12 @@ perfctr_fail: | |||
597 | enable_lapic_nmi_watchdog(); | 648 | enable_lapic_nmi_watchdog(); |
598 | 649 | ||
599 | return false; | 650 | return false; |
651 | #endif | ||
600 | } | 652 | } |
601 | 653 | ||
602 | static void release_pmc_hardware(void) | 654 | static void release_pmc_hardware(void) |
603 | { | 655 | { |
656 | #ifdef CONFIG_X86_LOCAL_APIC | ||
604 | int i; | 657 | int i; |
605 | 658 | ||
606 | for (i = 0; i < x86_pmu.num_counters; i++) { | 659 | for (i = 0; i < x86_pmu.num_counters; i++) { |
@@ -610,6 +663,7 @@ static void release_pmc_hardware(void) | |||
610 | 663 | ||
611 | if (nmi_watchdog == NMI_LOCAL_APIC) | 664 | if (nmi_watchdog == NMI_LOCAL_APIC) |
612 | enable_lapic_nmi_watchdog(); | 665 | enable_lapic_nmi_watchdog(); |
666 | #endif | ||
613 | } | 667 | } |
614 | 668 | ||
615 | static void hw_perf_counter_destroy(struct perf_counter *counter) | 669 | static void hw_perf_counter_destroy(struct perf_counter *counter) |
@@ -665,6 +719,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
665 | { | 719 | { |
666 | struct perf_counter_attr *attr = &counter->attr; | 720 | struct perf_counter_attr *attr = &counter->attr; |
667 | struct hw_perf_counter *hwc = &counter->hw; | 721 | struct hw_perf_counter *hwc = &counter->hw; |
722 | u64 config; | ||
668 | int err; | 723 | int err; |
669 | 724 | ||
670 | if (!x86_pmu_initialized()) | 725 | if (!x86_pmu_initialized()) |
@@ -700,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
700 | hwc->sample_period = x86_pmu.max_period; | 755 | hwc->sample_period = x86_pmu.max_period; |
701 | hwc->last_period = hwc->sample_period; | 756 | hwc->last_period = hwc->sample_period; |
702 | atomic64_set(&hwc->period_left, hwc->sample_period); | 757 | atomic64_set(&hwc->period_left, hwc->sample_period); |
758 | } else { | ||
759 | /* | ||
760 | * If we have a PMU initialized but no APIC | ||
761 | * interrupts, we cannot sample hardware | ||
762 | * counters (user-space has to fall back and | ||
763 | * sample via a hrtimer based software counter): | ||
764 | */ | ||
765 | if (!x86_pmu.apic) | ||
766 | return -EOPNOTSUPP; | ||
703 | } | 767 | } |
704 | 768 | ||
705 | counter->destroy = hw_perf_counter_destroy; | 769 | counter->destroy = hw_perf_counter_destroy; |
@@ -717,14 +781,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
717 | 781 | ||
718 | if (attr->config >= x86_pmu.max_events) | 782 | if (attr->config >= x86_pmu.max_events) |
719 | return -EINVAL; | 783 | return -EINVAL; |
784 | |||
720 | /* | 785 | /* |
721 | * The generic map: | 786 | * The generic map: |
722 | */ | 787 | */ |
723 | hwc->config |= x86_pmu.event_map(attr->config); | 788 | config = x86_pmu.event_map(attr->config); |
789 | |||
790 | if (config == 0) | ||
791 | return -ENOENT; | ||
792 | |||
793 | if (config == -1LL) | ||
794 | return -EINVAL; | ||
795 | |||
796 | hwc->config |= config; | ||
724 | 797 | ||
725 | return 0; | 798 | return 0; |
726 | } | 799 | } |
727 | 800 | ||
801 | static void p6_pmu_disable_all(void) | ||
802 | { | ||
803 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
804 | u64 val; | ||
805 | |||
806 | if (!cpuc->enabled) | ||
807 | return; | ||
808 | |||
809 | cpuc->enabled = 0; | ||
810 | barrier(); | ||
811 | |||
812 | /* p6 only has one enable register */ | ||
813 | rdmsrl(MSR_P6_EVNTSEL0, val); | ||
814 | val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
815 | wrmsrl(MSR_P6_EVNTSEL0, val); | ||
816 | } | ||
817 | |||
728 | static void intel_pmu_disable_all(void) | 818 | static void intel_pmu_disable_all(void) |
729 | { | 819 | { |
730 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); | 820 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); |
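
As the "p6 only has one enable register" comment above notes, on P6-class PMUs the enable bit in EVNTSEL0 gates both counters, so disable_all/enable_all are a single read-modify-write of MSR_P6_EVNTSEL0. Modeled in plain C (illustrative only; the MSR is stood in for by a variable):

    #include <stdbool.h>
    #include <stdint.h>

    #define ENABLE_BIT (1ULL << 22)    /* ARCH_PERFMON_EVENTSEL0_ENABLE */

    static uint64_t evntsel0;          /* stands in for MSR_P6_EVNTSEL0 */

    static void p6_set_global_enable(bool on)
    {
            /* One bit, one register, gates the whole PMU: */
            if (on)
                    evntsel0 |= ENABLE_BIT;
            else
                    evntsel0 &= ~ENABLE_BIT;
    }

This is also why p6_pmu_disable_counter() above must preserve the enable bit when a counter is individually parked on P6_NOP_COUNTER: clearing it would stop the other counter too.
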
@@ -766,6 +856,23 @@ void hw_perf_disable(void) | |||
766 | return x86_pmu.disable_all(); | 856 | return x86_pmu.disable_all(); |
767 | } | 857 | } |
768 | 858 | ||
859 | static void p6_pmu_enable_all(void) | ||
860 | { | ||
861 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
862 | unsigned long val; | ||
863 | |||
864 | if (cpuc->enabled) | ||
865 | return; | ||
866 | |||
867 | cpuc->enabled = 1; | ||
868 | barrier(); | ||
869 | |||
870 | /* p6 only has one enable register */ | ||
871 | rdmsrl(MSR_P6_EVNTSEL0, val); | ||
872 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
873 | wrmsrl(MSR_P6_EVNTSEL0, val); | ||
874 | } | ||
875 | |||
769 | static void intel_pmu_enable_all(void) | 876 | static void intel_pmu_enable_all(void) |
770 | { | 877 | { |
771 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); | 878 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); |
@@ -783,13 +890,13 @@ static void amd_pmu_enable_all(void) | |||
783 | barrier(); | 890 | barrier(); |
784 | 891 | ||
785 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 892 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
893 | struct perf_counter *counter = cpuc->counters[idx]; | ||
786 | u64 val; | 894 | u64 val; |
787 | 895 | ||
788 | if (!test_bit(idx, cpuc->active_mask)) | 896 | if (!test_bit(idx, cpuc->active_mask)) |
789 | continue; | 897 | continue; |
790 | rdmsrl(MSR_K7_EVNTSEL0 + idx, val); | 898 | |
791 | if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) | 899 | val = counter->hw.config; |
792 | continue; | ||
793 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | 900 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; |
794 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); | 901 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); |
795 | } | 902 | } |
@@ -818,16 +925,13 @@ static inline void intel_pmu_ack_status(u64 ack) | |||
818 | 925 | ||
819 | static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 926 | static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) |
820 | { | 927 | { |
821 | int err; | 928 | (void)checking_wrmsrl(hwc->config_base + idx, |
822 | err = checking_wrmsrl(hwc->config_base + idx, | ||
823 | hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); | 929 | hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); |
824 | } | 930 | } |
825 | 931 | ||
826 | static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | 932 | static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) |
827 | { | 933 | { |
828 | int err; | 934 | (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); |
829 | err = checking_wrmsrl(hwc->config_base + idx, | ||
830 | hwc->config); | ||
831 | } | 935 | } |
832 | 936 | ||
833 | static inline void | 937 | static inline void |
@@ -835,13 +939,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) | |||
835 | { | 939 | { |
836 | int idx = __idx - X86_PMC_IDX_FIXED; | 940 | int idx = __idx - X86_PMC_IDX_FIXED; |
837 | u64 ctrl_val, mask; | 941 | u64 ctrl_val, mask; |
838 | int err; | ||
839 | 942 | ||
840 | mask = 0xfULL << (idx * 4); | 943 | mask = 0xfULL << (idx * 4); |
841 | 944 | ||
842 | rdmsrl(hwc->config_base, ctrl_val); | 945 | rdmsrl(hwc->config_base, ctrl_val); |
843 | ctrl_val &= ~mask; | 946 | ctrl_val &= ~mask; |
844 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | 947 | (void)checking_wrmsrl(hwc->config_base, ctrl_val); |
948 | } | ||
949 | |||
950 | static inline void | ||
951 | p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | ||
952 | { | ||
953 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
954 | u64 val = P6_NOP_COUNTER; | ||
955 | |||
956 | if (cpuc->enabled) | ||
957 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
958 | |||
959 | (void)checking_wrmsrl(hwc->config_base + idx, val); | ||
845 | } | 960 | } |
846 | 961 | ||
847 | static inline void | 962 | static inline void |
@@ -911,6 +1026,8 @@ x86_perf_counter_set_period(struct perf_counter *counter, | |||
911 | err = checking_wrmsrl(hwc->counter_base + idx, | 1026 | err = checking_wrmsrl(hwc->counter_base + idx, |
912 | (u64)(-left) & x86_pmu.counter_mask); | 1027 | (u64)(-left) & x86_pmu.counter_mask); |
913 | 1028 | ||
1029 | perf_counter_update_userpage(counter); | ||
1030 | |||
914 | return ret; | 1031 | return ret; |
915 | } | 1032 | } |
916 | 1033 | ||
@@ -940,6 +1057,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) | |||
940 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | 1057 | err = checking_wrmsrl(hwc->config_base, ctrl_val); |
941 | } | 1058 | } |
942 | 1059 | ||
1060 | static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | ||
1061 | { | ||
1062 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
1063 | u64 val; | ||
1064 | |||
1065 | val = hwc->config; | ||
1066 | if (cpuc->enabled) | ||
1067 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
1068 | |||
1069 | (void)checking_wrmsrl(hwc->config_base + idx, val); | ||
1070 | } | ||
1071 | |||
1072 | |||
943 | static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 1073 | static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) |
944 | { | 1074 | { |
945 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | 1075 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { |
@@ -956,8 +1086,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | |||
956 | 1086 | ||
957 | if (cpuc->enabled) | 1087 | if (cpuc->enabled) |
958 | x86_pmu_enable_counter(hwc, idx); | 1088 | x86_pmu_enable_counter(hwc, idx); |
959 | else | ||
960 | x86_pmu_disable_counter(hwc, idx); | ||
961 | } | 1089 | } |
962 | 1090 | ||
963 | static int | 1091 | static int |
@@ -968,13 +1096,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) | |||
968 | if (!x86_pmu.num_counters_fixed) | 1096 | if (!x86_pmu.num_counters_fixed) |
969 | return -1; | 1097 | return -1; |
970 | 1098 | ||
971 | /* | ||
972 | * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: | ||
973 | */ | ||
974 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | ||
975 | boot_cpu_data.x86_model == 28) | ||
976 | return -1; | ||
977 | |||
978 | event = hwc->config & ARCH_PERFMON_EVENT_MASK; | 1099 | event = hwc->config & ARCH_PERFMON_EVENT_MASK; |
979 | 1100 | ||
980 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) | 1101 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) |
@@ -1040,6 +1161,8 @@ try_generic: | |||
1040 | x86_perf_counter_set_period(counter, hwc, idx); | 1161 | x86_perf_counter_set_period(counter, hwc, idx); |
1041 | x86_pmu.enable(hwc, idx); | 1162 | x86_pmu.enable(hwc, idx); |
1042 | 1163 | ||
1164 | perf_counter_update_userpage(counter); | ||
1165 | |||
1043 | return 0; | 1166 | return 0; |
1044 | } | 1167 | } |
1045 | 1168 | ||
@@ -1132,6 +1255,8 @@ static void x86_pmu_disable(struct perf_counter *counter) | |||
1132 | x86_perf_counter_update(counter, hwc, idx); | 1255 | x86_perf_counter_update(counter, hwc, idx); |
1133 | cpuc->counters[idx] = NULL; | 1256 | cpuc->counters[idx] = NULL; |
1134 | clear_bit(idx, cpuc->used_mask); | 1257 | clear_bit(idx, cpuc->used_mask); |
1258 | |||
1259 | perf_counter_update_userpage(counter); | ||
1135 | } | 1260 | } |
1136 | 1261 | ||
1137 | /* | 1262 | /* |
@@ -1176,6 +1301,49 @@ static void intel_pmu_reset(void) | |||
1176 | local_irq_restore(flags); | 1301 | local_irq_restore(flags); |
1177 | } | 1302 | } |
1178 | 1303 | ||
1304 | static int p6_pmu_handle_irq(struct pt_regs *regs) | ||
1305 | { | ||
1306 | struct perf_sample_data data; | ||
1307 | struct cpu_hw_counters *cpuc; | ||
1308 | struct perf_counter *counter; | ||
1309 | struct hw_perf_counter *hwc; | ||
1310 | int idx, handled = 0; | ||
1311 | u64 val; | ||
1312 | |||
1313 | data.regs = regs; | ||
1314 | data.addr = 0; | ||
1315 | |||
1316 | cpuc = &__get_cpu_var(cpu_hw_counters); | ||
1317 | |||
1318 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
1319 | if (!test_bit(idx, cpuc->active_mask)) | ||
1320 | continue; | ||
1321 | |||
1322 | counter = cpuc->counters[idx]; | ||
1323 | hwc = &counter->hw; | ||
1324 | |||
1325 | val = x86_perf_counter_update(counter, hwc, idx); | ||
1326 | if (val & (1ULL << (x86_pmu.counter_bits - 1))) | ||
1327 | continue; | ||
1328 | |||
1329 | /* | ||
1330 | * counter overflow | ||
1331 | */ | ||
1332 | handled = 1; | ||
1333 | data.period = counter->hw.last_period; | ||
1334 | |||
1335 | if (!x86_perf_counter_set_period(counter, hwc, idx)) | ||
1336 | continue; | ||
1337 | |||
1338 | if (perf_counter_overflow(counter, 1, &data)) | ||
1339 | p6_pmu_disable_counter(hwc, idx); | ||
1340 | } | ||
1341 | |||
1342 | if (handled) | ||
1343 | inc_irq_stat(apic_perf_irqs); | ||
1344 | |||
1345 | return handled; | ||
1346 | } | ||
1179 | 1347 | ||
1180 | /* | 1348 | /* |
1181 | * This handler is triggered by the local APIC, so the APIC IRQ handling | 1349 | * This handler is triggered by the local APIC, so the APIC IRQ handling |
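1182 |

A note on the overflow test in p6_pmu_handle_irq() above: each counter is programmed with a negative offset, so while it is still counting toward zero its top implemented bit stays set; a clear top bit after x86_perf_counter_update() means the counter wrapped. The test in isolation (a sketch, assuming the same counter_bits convention):

    #include <stdint.h>

    static int counter_overflowed(uint64_t val, int counter_bits)
    {
            /* Top implemented bit clear => the counter wrapped past zero. */
            return !(val & (1ULL << (counter_bits - 1)));
    }
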
@@ -1185,14 +1353,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
1185 | { | 1353 | { |
1186 | struct perf_sample_data data; | 1354 | struct perf_sample_data data; |
1187 | struct cpu_hw_counters *cpuc; | 1355 | struct cpu_hw_counters *cpuc; |
1188 | int bit, cpu, loops; | 1356 | int bit, loops; |
1189 | u64 ack, status; | 1357 | u64 ack, status; |
1190 | 1358 | ||
1191 | data.regs = regs; | 1359 | data.regs = regs; |
1192 | data.addr = 0; | 1360 | data.addr = 0; |
1193 | 1361 | ||
1194 | cpu = smp_processor_id(); | 1362 | cpuc = &__get_cpu_var(cpu_hw_counters); |
1195 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
1196 | 1363 | ||
1197 | perf_disable(); | 1364 | perf_disable(); |
1198 | status = intel_pmu_get_status(); | 1365 | status = intel_pmu_get_status(); |
@@ -1223,6 +1390,8 @@ again: | |||
1223 | if (!intel_pmu_save_and_restart(counter)) | 1390 | if (!intel_pmu_save_and_restart(counter)) |
1224 | continue; | 1391 | continue; |
1225 | 1392 | ||
1393 | data.period = counter->hw.last_period; | ||
1394 | |||
1226 | if (perf_counter_overflow(counter, 1, &data)) | 1395 | if (perf_counter_overflow(counter, 1, &data)) |
1227 | intel_pmu_disable_counter(&counter->hw, bit); | 1396 | intel_pmu_disable_counter(&counter->hw, bit); |
1228 | } | 1397 | } |
@@ -1247,14 +1416,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) | |||
1247 | struct cpu_hw_counters *cpuc; | 1416 | struct cpu_hw_counters *cpuc; |
1248 | struct perf_counter *counter; | 1417 | struct perf_counter *counter; |
1249 | struct hw_perf_counter *hwc; | 1418 | struct hw_perf_counter *hwc; |
1250 | int cpu, idx, handled = 0; | 1419 | int idx, handled = 0; |
1251 | u64 val; | 1420 | u64 val; |
1252 | 1421 | ||
1253 | data.regs = regs; | 1422 | data.regs = regs; |
1254 | data.addr = 0; | 1423 | data.addr = 0; |
1255 | 1424 | ||
1256 | cpu = smp_processor_id(); | 1425 | cpuc = &__get_cpu_var(cpu_hw_counters); |
1257 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
1258 | 1426 | ||
1259 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1427 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
1260 | if (!test_bit(idx, cpuc->active_mask)) | 1428 | if (!test_bit(idx, cpuc->active_mask)) |
@@ -1297,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) | |||
1297 | 1465 | ||
1298 | void set_perf_counter_pending(void) | 1466 | void set_perf_counter_pending(void) |
1299 | { | 1467 | { |
1468 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1300 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); | 1469 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); |
1470 | #endif | ||
1301 | } | 1471 | } |
1302 | 1472 | ||
1303 | void perf_counters_lapic_init(void) | 1473 | void perf_counters_lapic_init(void) |
1304 | { | 1474 | { |
1305 | if (!x86_pmu_initialized()) | 1475 | #ifdef CONFIG_X86_LOCAL_APIC |
1476 | if (!x86_pmu.apic || !x86_pmu_initialized()) | ||
1306 | return; | 1477 | return; |
1307 | 1478 | ||
1308 | /* | 1479 | /* |
1309 | * Always use NMI for PMU | 1480 | * Always use NMI for PMU |
1310 | */ | 1481 | */ |
1311 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1482 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1483 | #endif | ||
1312 | } | 1484 | } |
1313 | 1485 | ||
1314 | static int __kprobes | 1486 | static int __kprobes |
@@ -1332,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, | |||
1332 | 1504 | ||
1333 | regs = args->regs; | 1505 | regs = args->regs; |
1334 | 1506 | ||
1507 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1335 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1508 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1509 | #endif | ||
1336 | /* | 1510 | /* |
1337 | * Can't rely on the handled return value to say it was our NMI, two | 1511 | * Can't rely on the handled return value to say it was our NMI, two |
1338 | * counters could trigger 'simultaneously' raising two back-to-back NMIs. | 1512 | * counters could trigger 'simultaneously' raising two back-to-back NMIs. |
@@ -1351,6 +1525,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { | |||
1351 | .priority = 1 | 1525 | .priority = 1 |
1352 | }; | 1526 | }; |
1353 | 1527 | ||
1528 | static struct x86_pmu p6_pmu = { | ||
1529 | .name = "p6", | ||
1530 | .handle_irq = p6_pmu_handle_irq, | ||
1531 | .disable_all = p6_pmu_disable_all, | ||
1532 | .enable_all = p6_pmu_enable_all, | ||
1533 | .enable = p6_pmu_enable_counter, | ||
1534 | .disable = p6_pmu_disable_counter, | ||
1535 | .eventsel = MSR_P6_EVNTSEL0, | ||
1536 | .perfctr = MSR_P6_PERFCTR0, | ||
1537 | .event_map = p6_pmu_event_map, | ||
1538 | .raw_event = p6_pmu_raw_event, | ||
1539 | .max_events = ARRAY_SIZE(p6_perfmon_event_map), | ||
1540 | .apic = 1, | ||
1541 | .max_period = (1ULL << 31) - 1, | ||
1542 | .version = 0, | ||
1543 | .num_counters = 2, | ||
1544 | /* | ||
1545 | * Counters have 40 bits implemented. However, they are designed such | ||
1546 | * that bits [32-39] are sign extensions of bit 31. As such the | ||
1547 | * effective width of a counter for P6-like PMU is 32 bits only. | ||
1548 | * | ||
1549 | * See IA-32 Intel Architecture Software developer manual Vol 3B | ||
1550 | */ | ||
1551 | .counter_bits = 32, | ||
1552 | .counter_mask = (1ULL << 32) - 1, | ||
1553 | }; | ||
1554 | |||
1354 | static struct x86_pmu intel_pmu = { | 1555 | static struct x86_pmu intel_pmu = { |
1355 | .name = "Intel", | 1556 | .name = "Intel", |
1356 | .handle_irq = intel_pmu_handle_irq, | 1557 | .handle_irq = intel_pmu_handle_irq, |
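
The counter-width comment in the p6_pmu definition above also explains the choice of max_period. A small check (illustrative; it just reproduces the arithmetic implied by the struct fields):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t mask   = (1ULL << 32) - 1;    /* counter_mask above */
            uint64_t period = (1ULL << 31) - 1;    /* max_period above   */
            uint64_t start  = (0 - period) & mask; /* initial MSR value  */

            /* Bit 31 is set, so the hardware's copies of it in bits
             * 32-39 stay consistent. A period above 2^31 would yield a
             * start value with bit 31 clear, which the sign extension
             * would then corrupt. */
            assert((start >> 31) & 1);
            return 0;
    }
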
@@ -1363,6 +1564,7 @@ static struct x86_pmu intel_pmu = { | |||
1363 | .event_map = intel_pmu_event_map, | 1564 | .event_map = intel_pmu_event_map, |
1364 | .raw_event = intel_pmu_raw_event, | 1565 | .raw_event = intel_pmu_raw_event, |
1365 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), | 1566 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), |
1567 | .apic = 1, | ||
1366 | /* | 1568 | /* |
1367 | * Intel PMCs cannot be accessed sanely above 32 bit width, | 1569 | * Intel PMCs cannot be accessed sanely above 32 bit width, |
1368 | * so we install an artificial 1<<31 period regardless of | 1570 | * so we install an artificial 1<<31 period regardless of |
@@ -1386,10 +1588,43 @@ static struct x86_pmu amd_pmu = { | |||
1386 | .num_counters = 4, | 1588 | .num_counters = 4, |
1387 | .counter_bits = 48, | 1589 | .counter_bits = 48, |
1388 | .counter_mask = (1ULL << 48) - 1, | 1590 | .counter_mask = (1ULL << 48) - 1, |
1591 | .apic = 1, | ||
1389 | /* use highest bit to detect overflow */ | 1592 | /* use highest bit to detect overflow */ |
1390 | .max_period = (1ULL << 47) - 1, | 1593 | .max_period = (1ULL << 47) - 1, |
1391 | }; | 1594 | }; |
1392 | 1595 | ||
1596 | static int p6_pmu_init(void) | ||
1597 | { | ||
1598 | switch (boot_cpu_data.x86_model) { | ||
1599 | case 1: | ||
1600 | case 3: /* Pentium Pro */ | ||
1601 | case 5: | ||
1602 | case 6: /* Pentium II */ | ||
1603 | case 7: | ||
1604 | case 8: | ||
1605 | case 11: /* Pentium III */ | ||
1606 | break; | ||
1607 | case 9: | ||
1608 | case 13: | ||
1609 | /* Pentium M */ | ||
1610 | break; | ||
1611 | default: | ||
1612 | pr_cont("unsupported p6 CPU model %d ", | ||
1613 | boot_cpu_data.x86_model); | ||
1614 | return -ENODEV; | ||
1615 | } | ||
1616 | |||
1617 | x86_pmu = p6_pmu; | ||
1618 | |||
1619 | if (!cpu_has_apic) { | ||
1620 | pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); | ||
1621 | pr_info("no hardware sampling interrupt available.\n"); | ||
1622 | x86_pmu.apic = 0; | ||
1623 | } | ||
1624 | |||
1625 | return 0; | ||
1626 | } | ||
1627 | |||
1393 | static int intel_pmu_init(void) | 1628 | static int intel_pmu_init(void) |
1394 | { | 1629 | { |
1395 | union cpuid10_edx edx; | 1630 | union cpuid10_edx edx; |
@@ -1398,8 +1633,14 @@ static int intel_pmu_init(void) | |||
1398 | unsigned int ebx; | 1633 | unsigned int ebx; |
1399 | int version; | 1634 | int version; |
1400 | 1635 | ||
1401 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) | 1636 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { |
1637 | /* check for P6 processor family */ | ||
1638 | if (boot_cpu_data.x86 == 6) { | ||
1639 | return p6_pmu_init(); | ||
1640 | } else { | ||
1402 | return -ENODEV; | 1641 | return -ENODEV; |
1642 | } | ||
1643 | } | ||
1403 | 1644 | ||
1404 | /* | 1645 | /* |
1405 | * Check whether the Architectural PerfMon supports | 1646 | * Check whether the Architectural PerfMon supports |
@@ -1425,8 +1666,6 @@ static int intel_pmu_init(void) | |||
1425 | */ | 1666 | */ |
1426 | x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); | 1667 | x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); |
1427 | 1668 | ||
1428 | rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); | ||
1429 | |||
1430 | /* | 1669 | /* |
1431 | * Install the hw-cache-events table: | 1670 | * Install the hw-cache-events table: |
1432 | */ | 1671 | */ |
@@ -1459,18 +1698,16 @@ static int intel_pmu_init(void) | |||
1459 | 1698 | ||
1460 | static int amd_pmu_init(void) | 1699 | static int amd_pmu_init(void) |
1461 | { | 1700 | { |
1701 | /* Performance-monitoring supported from K7 and later: */ | ||
1702 | if (boot_cpu_data.x86 < 6) | ||
1703 | return -ENODEV; | ||
1704 | |||
1462 | x86_pmu = amd_pmu; | 1705 | x86_pmu = amd_pmu; |
1463 | 1706 | ||
1464 | switch (boot_cpu_data.x86) { | 1707 | /* Events are common for all AMDs */ |
1465 | case 0x0f: | 1708 | memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, |
1466 | case 0x10: | 1709 | sizeof(hw_cache_event_ids)); |
1467 | case 0x11: | ||
1468 | memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, | ||
1469 | sizeof(hw_cache_event_ids)); | ||
1470 | 1710 | ||
1471 | pr_cont("AMD Family 0f/10/11 events, "); | ||
1472 | break; | ||
1473 | } | ||
1474 | return 0; | 1711 | return 0; |
1475 | } | 1712 | } |
1476 | 1713 | ||
@@ -1498,21 +1735,22 @@ void __init init_hw_perf_counters(void) | |||
1498 | pr_cont("%s PMU driver.\n", x86_pmu.name); | 1735 | pr_cont("%s PMU driver.\n", x86_pmu.name); |
1499 | 1736 | ||
1500 | if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { | 1737 | if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { |
1501 | x86_pmu.num_counters = X86_PMC_MAX_GENERIC; | ||
1502 | WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", | 1738 | WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", |
1503 | x86_pmu.num_counters, X86_PMC_MAX_GENERIC); | 1739 | x86_pmu.num_counters, X86_PMC_MAX_GENERIC); |
1740 | x86_pmu.num_counters = X86_PMC_MAX_GENERIC; | ||
1504 | } | 1741 | } |
1505 | perf_counter_mask = (1 << x86_pmu.num_counters) - 1; | 1742 | perf_counter_mask = (1 << x86_pmu.num_counters) - 1; |
1506 | perf_max_counters = x86_pmu.num_counters; | 1743 | perf_max_counters = x86_pmu.num_counters; |
1507 | 1744 | ||
1508 | if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { | 1745 | if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { |
1509 | x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; | ||
1510 | WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", | 1746 | WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", |
1511 | x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); | 1747 | x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); |
1748 | x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; | ||
1512 | } | 1749 | } |
1513 | 1750 | ||
1514 | perf_counter_mask |= | 1751 | perf_counter_mask |= |
1515 | ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; | 1752 | ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; |
1753 | x86_pmu.intel_ctrl = perf_counter_mask; | ||
1516 | 1754 | ||
1517 | perf_counters_lapic_init(); | 1755 | perf_counters_lapic_init(); |
1518 | register_die_notifier(&perf_counter_nmi_notifier); | 1756 | register_die_notifier(&perf_counter_nmi_notifier); |
@@ -1554,14 +1792,15 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) | |||
1554 | */ | 1792 | */ |
1555 | 1793 | ||
1556 | static inline | 1794 | static inline |
1557 | void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) | 1795 | void callchain_store(struct perf_callchain_entry *entry, u64 ip) |
1558 | { | 1796 | { |
1559 | if (entry->nr < MAX_STACK_DEPTH) | 1797 | if (entry->nr < PERF_MAX_STACK_DEPTH) |
1560 | entry->ip[entry->nr++] = ip; | 1798 | entry->ip[entry->nr++] = ip; |
1561 | } | 1799 | } |
1562 | 1800 | ||
1563 | static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); | 1801 | static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); |
1564 | static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); | 1802 | static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); |
1803 | static DEFINE_PER_CPU(int, in_nmi_frame); | ||
1565 | 1804 | ||
1566 | 1805 | ||
1567 | static void | 1806 | static void |
@@ -1577,14 +1816,19 @@ static void backtrace_warning(void *data, char *msg) | |||
1577 | 1816 | ||
1578 | static int backtrace_stack(void *data, char *name) | 1817 | static int backtrace_stack(void *data, char *name) |
1579 | { | 1818 | { |
1580 | /* Don't bother with IRQ stacks for now */ | 1819 | per_cpu(in_nmi_frame, smp_processor_id()) = |
1581 | return -1; | 1820 | x86_is_stack_id(NMI_STACK, name); |
1821 | |||
1822 | return 0; | ||
1582 | } | 1823 | } |
1583 | 1824 | ||
1584 | static void backtrace_address(void *data, unsigned long addr, int reliable) | 1825 | static void backtrace_address(void *data, unsigned long addr, int reliable) |
1585 | { | 1826 | { |
1586 | struct perf_callchain_entry *entry = data; | 1827 | struct perf_callchain_entry *entry = data; |
1587 | 1828 | ||
1829 | if (per_cpu(in_nmi_frame, smp_processor_id())) | ||
1830 | return; | ||
1831 | |||
1588 | if (reliable) | 1832 | if (reliable) |
1589 | callchain_store(entry, addr); | 1833 | callchain_store(entry, addr); |
1590 | } | 1834 | } |
@@ -1596,47 +1840,59 @@ static const struct stacktrace_ops backtrace_ops = { | |||
1596 | .address = backtrace_address, | 1840 | .address = backtrace_address, |
1597 | }; | 1841 | }; |
1598 | 1842 | ||
1843 | #include "../dumpstack.h" | ||
1844 | |||
1599 | static void | 1845 | static void |
1600 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | 1846 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) |
1601 | { | 1847 | { |
1602 | unsigned long bp; | 1848 | callchain_store(entry, PERF_CONTEXT_KERNEL); |
1603 | char *stack; | 1849 | callchain_store(entry, regs->ip); |
1604 | int nr = entry->nr; | ||
1605 | 1850 | ||
1606 | callchain_store(entry, instruction_pointer(regs)); | 1851 | dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); |
1852 | } | ||
1607 | 1853 | ||
1608 | stack = ((char *)regs + sizeof(struct pt_regs)); | 1854 | /* |
1609 | #ifdef CONFIG_FRAME_POINTER | 1855 | * best effort, GUP based copy_from_user() that assumes IRQ or NMI context |
1610 | bp = frame_pointer(regs); | 1856 | */ |
1611 | #else | 1857 | static unsigned long |
1612 | bp = 0; | 1858 | copy_from_user_nmi(void *to, const void __user *from, unsigned long n) |
1613 | #endif | 1859 | { |
1860 | unsigned long offset, addr = (unsigned long)from; | ||
1861 | int type = in_nmi() ? KM_NMI : KM_IRQ0; | ||
1862 | unsigned long size, len = 0; | ||
1863 | struct page *page; | ||
1864 | void *map; | ||
1865 | int ret; | ||
1614 | 1866 | ||
1615 | dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); | 1867 | do { |
1868 | ret = __get_user_pages_fast(addr, 1, 0, &page); | ||
1869 | if (!ret) | ||
1870 | break; | ||
1616 | 1871 | ||
1617 | entry->kernel = entry->nr - nr; | 1872 | offset = addr & (PAGE_SIZE - 1); |
1618 | } | 1873 | size = min(PAGE_SIZE - offset, n - len); |
1619 | 1874 | ||
1875 | map = kmap_atomic(page, type); | ||
1876 | memcpy(to, map+offset, size); | ||
1877 | kunmap_atomic(map, type); | ||
1878 | put_page(page); | ||
1620 | 1879 | ||
1621 | struct stack_frame { | 1880 | len += size; |
1622 | const void __user *next_fp; | 1881 | to += size; |
1623 | unsigned long return_address; | 1882 | addr += size; |
1624 | }; | 1883 | |
1884 | } while (len < n); | ||
1885 | |||
1886 | return len; | ||
1887 | } | ||
1625 | 1888 | ||
1626 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | 1889 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) |
1627 | { | 1890 | { |
1628 | int ret; | 1891 | unsigned long bytes; |
1629 | |||
1630 | if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) | ||
1631 | return 0; | ||
1632 | 1892 | ||
1633 | ret = 1; | 1893 | bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); |
1634 | pagefault_disable(); | ||
1635 | if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) | ||
1636 | ret = 0; | ||
1637 | pagefault_enable(); | ||
1638 | 1894 | ||
1639 | return ret; | 1895 | return bytes == sizeof(*frame); |
1640 | } | 1896 | } |
1641 | 1897 | ||
1642 | static void | 1898 | static void |
@@ -1644,28 +1900,28 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) | |||
1644 | { | 1900 | { |
1645 | struct stack_frame frame; | 1901 | struct stack_frame frame; |
1646 | const void __user *fp; | 1902 | const void __user *fp; |
1647 | int nr = entry->nr; | ||
1648 | 1903 | ||
1649 | regs = (struct pt_regs *)current->thread.sp0 - 1; | 1904 | if (!user_mode(regs)) |
1650 | fp = (void __user *)regs->bp; | 1905 | regs = task_pt_regs(current); |
1906 | |||
1907 | fp = (void __user *)regs->bp; | ||
1651 | 1908 | ||
1909 | callchain_store(entry, PERF_CONTEXT_USER); | ||
1652 | callchain_store(entry, regs->ip); | 1910 | callchain_store(entry, regs->ip); |
1653 | 1911 | ||
1654 | while (entry->nr < MAX_STACK_DEPTH) { | 1912 | while (entry->nr < PERF_MAX_STACK_DEPTH) { |
1655 | frame.next_fp = NULL; | 1913 | frame.next_frame = NULL; |
1656 | frame.return_address = 0; | 1914 | frame.return_address = 0; |
1657 | 1915 | ||
1658 | if (!copy_stack_frame(fp, &frame)) | 1916 | if (!copy_stack_frame(fp, &frame)) |
1659 | break; | 1917 | break; |
1660 | 1918 | ||
1661 | if ((unsigned long)fp < user_stack_pointer(regs)) | 1919 | if ((unsigned long)fp < regs->sp) |
1662 | break; | 1920 | break; |
1663 | 1921 | ||
1664 | callchain_store(entry, frame.return_address); | 1922 | callchain_store(entry, frame.return_address); |
1665 | fp = frame.next_fp; | 1923 | fp = frame.next_frame; |
1666 | } | 1924 | } |
1667 | |||
1668 | entry->user = entry->nr - nr; | ||
1669 | } | 1925 | } |
1670 | 1926 | ||
1671 | static void | 1927 | static void |
@@ -1701,9 +1957,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
1701 | entry = &__get_cpu_var(irq_entry); | 1957 | entry = &__get_cpu_var(irq_entry); |
1702 | 1958 | ||
1703 | entry->nr = 0; | 1959 | entry->nr = 0; |
1704 | entry->hv = 0; | ||
1705 | entry->kernel = 0; | ||
1706 | entry->user = 0; | ||
1707 | 1960 | ||
1708 | perf_do_callchain(regs, entry); | 1961 | perf_do_callchain(regs, entry); |
1709 | 1962 | ||
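
The reworked perf_callchain_user() above walks the user stack through the saved frame-pointer chain, copying each frame with the new GUP-based copy_from_user_nmi() so the walk is safe in NMI context. The shape of the walk, modeled in userspace (hypothetical helper; the kernel version replaces the direct dereference with copy_from_user_nmi() and aborts when the copy comes up short):

    #include <stddef.h>

    struct stack_frame {
            const struct stack_frame *next_frame;  /* saved frame pointer  */
            unsigned long return_address;          /* saved return address */
    };

    static size_t walk_user_stack(const struct stack_frame *fp, unsigned long sp,
                                  unsigned long *ips, size_t max)
    {
            size_t n = 0;

            while (n < max && fp != NULL) {
                    struct stack_frame frame = *fp; /* kernel: copy_from_user_nmi() */

                    if ((unsigned long)fp < sp)     /* frame must not lie below sp */
                            break;
                    ips[n++] = frame.return_address;
                    fp = frame.next_frame;
            }
            return n;
    }

The `fp < sp` check is the sanity test from the hunk above: a legitimate frame pointer always points at or above the current stack pointer, so anything below it terminates the walk.
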
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d6f5b9fbde32..e60ed740d2b3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void) | |||
716 | wd_ops = &k7_wd_ops; | 716 | wd_ops = &k7_wd_ops; |
717 | break; | 717 | break; |
718 | case X86_VENDOR_INTEL: | 718 | case X86_VENDOR_INTEL: |
719 | /* | 719 | /* Work around where perfctr1 doesn't have a working enable |
720 | * Work around Core Duo (Yonah) errata AE49 where perfctr1 | 720 | * bit as described in the following errata: |
721 | * doesn't have a working enable bit. | 721 | * AE49 Core Duo and Intel Core Solo 65 nm |
722 | * AN49 Intel Pentium Dual-Core | ||
723 | * AF49 Dual-Core Intel Xeon Processor LV | ||
722 | */ | 724 | */ |
723 | if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { | 725 | if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || |
726 | ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && | ||
727 | boot_cpu_data.x86_mask == 4))) { | ||
724 | intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; | 728 | intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; |
725 | intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; | 729 | intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; |
726 | } | 730 | } |
@@ -799,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz) | |||
799 | wd_ops->rearm(wd, nmi_hz); | 803 | wd_ops->rearm(wd, nmi_hz); |
800 | return 1; | 804 | return 1; |
801 | } | 805 | } |
802 | |||
803 | int lapic_watchdog_ok(void) | ||
804 | { | ||
805 | return wd_ops != NULL; | ||
806 | } | ||
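
The widened erratum check above matches on family/model and, for the AN49/AF49 parts, on a specific stepping as well. Factored out for readability (a hypothetical helper; the kernel open-codes the condition):

    /* Returns nonzero for parts whose perfctr1 lacks a working enable
     * bit, per errata AE49 (Core Duo/Solo), AN49 and AF49. */
    static int has_broken_perfctr1(int family, int model, int stepping)
    {
            if (family != 6)
                    return 0;
            if (model == 14)                  /* AE49: Core Duo/Solo 65 nm */
                    return 1;
            if (model == 15 && stepping == 4) /* AN49 / AF49               */
                    return 1;
            return 0;
    }
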
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ff958248e61d..5e409dc298a4 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <asm/cpu.h> | 27 | #include <asm/cpu.h> |
28 | #include <asm/reboot.h> | 28 | #include <asm/reboot.h> |
29 | #include <asm/virtext.h> | 29 | #include <asm/virtext.h> |
30 | #include <asm/iommu.h> | ||
30 | 31 | ||
31 | 32 | ||
32 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) | 33 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) |
@@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs) | |||
103 | #ifdef CONFIG_HPET_TIMER | 104 | #ifdef CONFIG_HPET_TIMER |
104 | hpet_disable(); | 105 | hpet_disable(); |
105 | #endif | 106 | #endif |
107 | |||
108 | #ifdef CONFIG_X86_64 | ||
109 | pci_iommu_shutdown(); | ||
110 | #endif | ||
111 | |||
106 | crash_save_cpu(regs, safe_smp_processor_id()); | 112 | crash_save_cpu(regs, safe_smp_processor_id()); |
107 | } | 113 | } |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 95ea5fa7d444..c8405718a4c3 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include "dumpstack.h" | 22 | #include "dumpstack.h" |
23 | 23 | ||
24 | int panic_on_unrecovered_nmi; | 24 | int panic_on_unrecovered_nmi; |
25 | int panic_on_io_nmi; | ||
25 | unsigned int code_bytes = 64; | 26 | unsigned int code_bytes = 64; |
26 | int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; | 27 | int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; |
27 | static int die_counter; | 28 | static int die_counter; |
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d593cd1f58dc..bca5fba91c9e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -19,6 +19,12 @@ | |||
19 | 19 | ||
20 | #include "dumpstack.h" | 20 | #include "dumpstack.h" |
21 | 21 | ||
22 | /* Just a stub for now */ | ||
23 | int x86_is_stack_id(int id, char *name) | ||
24 | { | ||
25 | return 0; | ||
26 | } | ||
27 | |||
22 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | 28 | void dump_trace(struct task_struct *task, struct pt_regs *regs, |
23 | unsigned long *stack, unsigned long bp, | 29 | unsigned long *stack, unsigned long bp, |
24 | const struct stacktrace_ops *ops, void *data) | 30 | const struct stacktrace_ops *ops, void *data) |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d35db5993fd6..54b0a3276766 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -19,10 +19,8 @@ | |||
19 | 19 | ||
20 | #include "dumpstack.h" | 20 | #include "dumpstack.h" |
21 | 21 | ||
22 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | 22 | |
23 | unsigned *usedp, char **idp) | 23 | static char x86_stack_ids[][8] = { |
24 | { | ||
25 | static char ids[][8] = { | ||
26 | [DEBUG_STACK - 1] = "#DB", | 24 | [DEBUG_STACK - 1] = "#DB", |
27 | [NMI_STACK - 1] = "NMI", | 25 | [NMI_STACK - 1] = "NMI", |
28 | [DOUBLEFAULT_STACK - 1] = "#DF", | 26 | [DOUBLEFAULT_STACK - 1] = "#DF", |
@@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | |||
33 | N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | 31 | N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" |
34 | #endif | 32 | #endif |
35 | }; | 33 | }; |
34 | |||
35 | int x86_is_stack_id(int id, char *name) | ||
36 | { | ||
37 | return x86_stack_ids[id - 1] == name; | ||
38 | } | ||
39 | |||
40 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | ||
41 | unsigned *usedp, char **idp) | ||
42 | { | ||
36 | unsigned k; | 43 | unsigned k; |
37 | 44 | ||
38 | /* | 45 | /* |
@@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | |||
61 | if (*usedp & (1U << k)) | 68 | if (*usedp & (1U << k)) |
62 | break; | 69 | break; |
63 | *usedp |= 1U << k; | 70 | *usedp |= 1U << k; |
64 | *idp = ids[k]; | 71 | *idp = x86_stack_ids[k]; |
65 | return (unsigned long *)end; | 72 | return (unsigned long *)end; |
66 | } | 73 | } |
67 | /* | 74 | /* |
@@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | |||
81 | do { | 88 | do { |
82 | ++j; | 89 | ++j; |
83 | end -= EXCEPTION_STKSZ; | 90 | end -= EXCEPTION_STKSZ; |
84 | ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | 91 | x86_stack_ids[j][4] = '1' + |
92 | (j - N_EXCEPTION_STACKS); | ||
85 | } while (stack < end - EXCEPTION_STKSZ); | 93 | } while (stack < end - EXCEPTION_STKSZ); |
86 | if (*usedp & (1U << j)) | 94 | if (*usedp & (1U << j)) |
87 | break; | 95 | break; |
88 | *usedp |= 1U << j; | 96 | *usedp |= 1U << j; |
89 | *idp = ids[j]; | 97 | *idp = x86_stack_ids[j]; |
90 | return (unsigned long *)end; | 98 | return (unsigned long *)end; |
91 | } | 99 | } |
92 | #endif | 100 | #endif |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7271fa33d791..5cb5725b2bae 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -627,10 +627,9 @@ __init void e820_setup_gap(void) | |||
627 | #ifdef CONFIG_X86_64 | 627 | #ifdef CONFIG_X86_64 |
628 | if (!found) { | 628 | if (!found) { |
629 | gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; | 629 | gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; |
630 | printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " | 630 | printk(KERN_ERR |
631 | "address range\n" | 631 | "PCI: Warning: Cannot find a gap in the 32bit address range\n" |
632 | KERN_ERR "PCI: Unassigned devices with 32bit resource " | 632 | "PCI: Unassigned devices with 32bit resource registers may break!\n"); |
633 | "registers may break!\n"); | ||
634 | } | 633 | } |
635 | #endif | 634 | #endif |
636 | 635 | ||
@@ -1383,6 +1382,8 @@ static unsigned long ram_alignment(resource_size_t pos) | |||
1383 | return 32*1024*1024; | 1382 | return 32*1024*1024; |
1384 | } | 1383 | } |
1385 | 1384 | ||
1385 | #define MAX_RESOURCE_SIZE ((resource_size_t)-1) | ||
1386 | |||
1386 | void __init e820_reserve_resources_late(void) | 1387 | void __init e820_reserve_resources_late(void) |
1387 | { | 1388 | { |
1388 | int i; | 1389 | int i; |
@@ -1400,17 +1401,19 @@ void __init e820_reserve_resources_late(void) | |||
1400 | * avoid stolen RAM: | 1401 | * avoid stolen RAM: |
1401 | */ | 1402 | */ |
1402 | for (i = 0; i < e820.nr_map; i++) { | 1403 | for (i = 0; i < e820.nr_map; i++) { |
1403 | struct e820entry *entry = &e820_saved.map[i]; | 1404 | struct e820entry *entry = &e820.map[i]; |
1404 | resource_size_t start, end; | 1405 | u64 start, end; |
1405 | 1406 | ||
1406 | if (entry->type != E820_RAM) | 1407 | if (entry->type != E820_RAM) |
1407 | continue; | 1408 | continue; |
1408 | start = entry->addr + entry->size; | 1409 | start = entry->addr + entry->size; |
1409 | end = round_up(start, ram_alignment(start)); | 1410 | end = round_up(start, ram_alignment(start)) - 1; |
1410 | if (start == end) | 1411 | if (end > MAX_RESOURCE_SIZE) |
1412 | end = MAX_RESOURCE_SIZE; | ||
1413 | if (start >= end) | ||
1411 | continue; | 1414 | continue; |
1412 | reserve_region_with_split(&iomem_resource, start, | 1415 | reserve_region_with_split(&iomem_resource, start, end, |
1413 | end - 1, "RAM buffer"); | 1416 | "RAM buffer"); |
1414 | } | 1417 | } |
1415 | } | 1418 | } |
1416 | 1419 | ||
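The e820_reserve_resources_late() hunk above computes the RAM-buffer range in plain u64 and clamps the end to MAX_RESOURCE_SIZE, because resource_size_t can be narrower than an e820 address (32 bits on kernels without 64-bit physical addressing) and round_up() can wrap at the very top of the address space. A minimal user-space sketch of the same arithmetic, with made-up types and addresses rather than kernel code:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t resource_size_t;               /* assume 32-bit resources */
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)

/* round x up to a power-of-two alignment, like the kernel's round_up() */
static uint64_t round_up64(uint64_t x, uint64_t align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	/* hypothetical RAM region ending just above 4GB, 32MB alignment */
	uint64_t start = 0x100100000ULL;
	uint64_t end   = round_up64(start, 32 << 20) - 1;

	if (end > MAX_RESOURCE_SIZE)    /* wider than a resource can describe */
		end = MAX_RESOURCE_SIZE;
	if (start >= end)               /* clamping emptied the range: skip */
		printf("nothing to reserve\n");
	else
		printf("reserve [%#llx, %#llx]\n",
		       (unsigned long long)start, (unsigned long long)end);
	return 0;
}

With these values the clamp collapses the range and the entry is skipped, which is exactly the case the added "if (start >= end)" test guards against.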
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1736acc4d7aa..fe26ba3e3451 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
@@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void) | |||
240 | unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; | 240 | unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; |
241 | int e820_type; | 241 | int e820_type; |
242 | 242 | ||
243 | if (md->attribute & EFI_MEMORY_WB) | 243 | switch (md->type) { |
244 | e820_type = E820_RAM; | 244 | case EFI_LOADER_CODE: |
245 | else | 245 | case EFI_LOADER_DATA: |
246 | case EFI_BOOT_SERVICES_CODE: | ||
247 | case EFI_BOOT_SERVICES_DATA: | ||
248 | case EFI_CONVENTIONAL_MEMORY: | ||
249 | if (md->attribute & EFI_MEMORY_WB) | ||
250 | e820_type = E820_RAM; | ||
251 | else | ||
252 | e820_type = E820_RESERVED; | ||
253 | break; | ||
254 | case EFI_ACPI_RECLAIM_MEMORY: | ||
255 | e820_type = E820_ACPI; | ||
256 | break; | ||
257 | case EFI_ACPI_MEMORY_NVS: | ||
258 | e820_type = E820_NVS; | ||
259 | break; | ||
260 | case EFI_UNUSABLE_MEMORY: | ||
261 | e820_type = E820_UNUSABLE; | ||
262 | break; | ||
263 | default: | ||
264 | /* | ||
265 | * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE | ||
266 | * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO | ||
267 | * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE | ||
268 | */ | ||
246 | e820_type = E820_RESERVED; | 269 | e820_type = E820_RESERVED; |
270 | break; | ||
271 | } | ||
247 | e820_add_region(start, size, e820_type); | 272 | e820_add_region(start, size, e820_type); |
248 | } | 273 | } |
249 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 274 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
@@ -329,7 +354,7 @@ void __init efi_init(void) | |||
329 | */ | 354 | */ |
330 | c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); | 355 | c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); |
331 | if (c16) { | 356 | if (c16) { |
332 | for (i = 0; i < sizeof(vendor) && *c16; ++i) | 357 | for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) |
333 | vendor[i] = *c16++; | 358 | vendor[i] = *c16++; |
334 | vendor[i] = '\0'; | 359 | vendor[i] = '\0'; |
335 | } else | 360 | } else |
@@ -487,7 +512,7 @@ void __init efi_enter_virtual_mode(void) | |||
487 | && end_pfn <= max_pfn_mapped)) | 512 | && end_pfn <= max_pfn_mapped)) |
488 | va = __va(md->phys_addr); | 513 | va = __va(md->phys_addr); |
489 | else | 514 | else |
490 | va = efi_ioremap(md->phys_addr, size); | 515 | va = efi_ioremap(md->phys_addr, size, md->type); |
491 | 516 | ||
492 | md->virt_addr = (u64) (unsigned long) va; | 517 | md->virt_addr = (u64) (unsigned long) va; |
493 | 518 | ||
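The do_add_efi_memmap() switch stops inferring everything from the EFI_MEMORY_WB attribute and classifies descriptors by type: loader and boot-services regions become ordinary RAM when write-back cacheable (they are reclaimable after ExitBootServices), ACPI reclaim/NVS and unusable memory get their dedicated e820 types, and everything else stays reserved. A standalone model of that decision, using local stand-in enum values rather than the real EFI/e820 constants:

#include <stdio.h>

enum efi_type { LOADER_CODE, LOADER_DATA, BOOT_CODE, BOOT_DATA,
		CONVENTIONAL, ACPI_RECLAIM, ACPI_NVS, UNUSABLE, MMIO };
enum e820_type { RAM, RESERVED, ACPI, NVS, UNUSABLE_E820 };

static enum e820_type classify(enum efi_type t, int write_back)
{
	switch (t) {
	case LOADER_CODE: case LOADER_DATA:
	case BOOT_CODE: case BOOT_DATA: case CONVENTIONAL:
		/* usable as RAM only when write-back cacheable */
		return write_back ? RAM : RESERVED;
	case ACPI_RECLAIM:
		return ACPI;
	case ACPI_NVS:
		return NVS;
	case UNUSABLE:
		return UNUSABLE_E820;
	default:
		return RESERVED;        /* runtime services, MMIO, PAL, ... */
	}
}

int main(void)
{
	printf("boot data -> %d, mmio -> %d\n",
	       classify(BOOT_DATA, 1), classify(MMIO, 1));
	return 0;
}

The efi_64.c hunk that follows is the other half of the same cleanup: MMIO descriptors are handed to ioremap() instead of being forced into the direct kernel mapping.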
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 22c3b7828c50..ac0621a7ac3d 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c | |||
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void) | |||
98 | early_runtime_code_mapping_set_exec(0); | 98 | early_runtime_code_mapping_set_exec(0); |
99 | } | 99 | } |
100 | 100 | ||
101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) | 101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, |
102 | u32 type) | ||
102 | { | 103 | { |
103 | unsigned long last_map_pfn; | 104 | unsigned long last_map_pfn; |
104 | 105 | ||
106 | if (type == EFI_MEMORY_MAPPED_IO) | ||
107 | return ioremap(phys_addr, size); | ||
108 | |||
105 | last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); | 109 | last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); |
106 | if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) | 110 | if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) |
107 | return NULL; | 111 | return NULL; |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c929add475c9..c097e7d607c6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -48,7 +48,6 @@ | |||
48 | #include <asm/segment.h> | 48 | #include <asm/segment.h> |
49 | #include <asm/smp.h> | 49 | #include <asm/smp.h> |
50 | #include <asm/page_types.h> | 50 | #include <asm/page_types.h> |
51 | #include <asm/desc.h> | ||
52 | #include <asm/percpu.h> | 51 | #include <asm/percpu.h> |
53 | #include <asm/dwarf2.h> | 52 | #include <asm/dwarf2.h> |
54 | #include <asm/processor-flags.h> | 53 | #include <asm/processor-flags.h> |
@@ -84,7 +83,7 @@ | |||
84 | #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF | 83 | #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF |
85 | #else | 84 | #else |
86 | #define preempt_stop(clobbers) | 85 | #define preempt_stop(clobbers) |
87 | #define resume_kernel restore_nocheck | 86 | #define resume_kernel restore_all |
88 | #endif | 87 | #endif |
89 | 88 | ||
90 | .macro TRACE_IRQS_IRET | 89 | .macro TRACE_IRQS_IRET |
@@ -372,7 +371,7 @@ END(ret_from_exception) | |||
372 | ENTRY(resume_kernel) | 371 | ENTRY(resume_kernel) |
373 | DISABLE_INTERRUPTS(CLBR_ANY) | 372 | DISABLE_INTERRUPTS(CLBR_ANY) |
374 | cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? | 373 | cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? |
375 | jnz restore_nocheck | 374 | jnz restore_all |
376 | need_resched: | 375 | need_resched: |
377 | movl TI_flags(%ebp), %ecx # need_resched set ? | 376 | movl TI_flags(%ebp), %ecx # need_resched set ? |
378 | testb $_TIF_NEED_RESCHED, %cl | 377 | testb $_TIF_NEED_RESCHED, %cl |
@@ -540,6 +539,8 @@ syscall_exit: | |||
540 | jne syscall_exit_work | 539 | jne syscall_exit_work |
541 | 540 | ||
542 | restore_all: | 541 | restore_all: |
542 | TRACE_IRQS_IRET | ||
543 | restore_all_notrace: | ||
543 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS | 544 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS |
544 | # Warning: PT_OLDSS(%esp) contains the wrong/random values if we | 545 | # Warning: PT_OLDSS(%esp) contains the wrong/random values if we |
545 | # are returning to the kernel. | 546 | # are returning to the kernel. |
@@ -551,8 +552,6 @@ restore_all: | |||
551 | CFI_REMEMBER_STATE | 552 | CFI_REMEMBER_STATE |
552 | je ldt_ss # returning to user-space with LDT SS | 553 | je ldt_ss # returning to user-space with LDT SS |
553 | restore_nocheck: | 554 | restore_nocheck: |
554 | TRACE_IRQS_IRET | ||
555 | restore_nocheck_notrace: | ||
556 | RESTORE_REGS 4 # skip orig_eax/error_code | 555 | RESTORE_REGS 4 # skip orig_eax/error_code |
557 | CFI_ADJUST_CFA_OFFSET -4 | 556 | CFI_ADJUST_CFA_OFFSET -4 |
558 | irq_return: | 557 | irq_return: |
@@ -588,22 +587,34 @@ ldt_ss: | |||
588 | jne restore_nocheck | 587 | jne restore_nocheck |
589 | #endif | 588 | #endif |
590 | 589 | ||
591 | /* If returning to userspace with 16bit stack, | 590 | /* |
592 | * try to fix the higher word of ESP, as the CPU | 591 | * Set up and switch to the ESPFIX stack |
593 | * won't restore it. | 592 | * |
594 | * This is an "official" bug of all the x86-compatible | 593 | * We're returning to userspace with a 16 bit stack. The CPU will not |
595 | * CPUs, which we can try to work around to make | 594 | * restore the high word of ESP for us on executing iret... This is an |
596 | * dosemu and wine happy. */ | 595 | * "official" bug of all the x86-compatible CPUs, which we can work |
597 | movl PT_OLDESP(%esp), %eax | 596 | * around to make dosemu and wine happy. We do this by preloading the |
598 | movl %esp, %edx | 597 | * high word of ESP with the high word of the userspace ESP while |
599 | call patch_espfix_desc | 598 | * compensating for the offset by changing to the ESPFIX segment with |
599 | * a base address that compensates for the difference. | ||
600 | */ | ||
601 | mov %esp, %edx /* load kernel esp */ | ||
602 | mov PT_OLDESP(%esp), %eax /* load userspace esp */ | ||
603 | mov %dx, %ax /* eax: new kernel esp */ | ||
604 | sub %eax, %edx /* offset (low word is 0) */ | ||
605 | PER_CPU(gdt_page, %ebx) | ||
606 | shr $16, %edx | ||
607 | mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ | ||
608 | mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ | ||
600 | pushl $__ESPFIX_SS | 609 | pushl $__ESPFIX_SS |
601 | CFI_ADJUST_CFA_OFFSET 4 | 610 | CFI_ADJUST_CFA_OFFSET 4 |
602 | pushl %eax | 611 | push %eax /* new kernel esp */ |
603 | CFI_ADJUST_CFA_OFFSET 4 | 612 | CFI_ADJUST_CFA_OFFSET 4 |
613 | /* Disable interrupts, but do not irqtrace this section: we | ||
614 | * will soon execute iret and the tracer has already been set to | ||
615 | * the irqstate after the iret */ | ||
604 | DISABLE_INTERRUPTS(CLBR_EAX) | 616 | DISABLE_INTERRUPTS(CLBR_EAX) |
605 | TRACE_IRQS_OFF | 617 | lss (%esp), %esp /* switch to espfix segment */ |
606 | lss (%esp), %esp | ||
607 | CFI_ADJUST_CFA_OFFSET -8 | 618 | CFI_ADJUST_CFA_OFFSET -8 |
608 | jmp restore_nocheck | 619 | jmp restore_nocheck |
609 | CFI_ENDPROC | 620 | CFI_ENDPROC |
@@ -716,15 +727,24 @@ PTREGSCALL(vm86) | |||
716 | PTREGSCALL(vm86old) | 727 | PTREGSCALL(vm86old) |
717 | 728 | ||
718 | .macro FIXUP_ESPFIX_STACK | 729 | .macro FIXUP_ESPFIX_STACK |
719 | /* since we are on a wrong stack, we cant make it a C code :( */ | 730 | /* |
731 | * Switch back from the ESPFIX stack to the normal zero-based stack | ||
732 | * | ||
733 | * We can't call C functions using the ESPFIX stack. This code reads | ||
734 | * the high word of the segment base from the GDT, switches to the | ||
735 | * normal stack and adjusts ESP by the matching offset. | ||
736 | */ | ||
737 | /* fixup the stack */ | ||
720 | PER_CPU(gdt_page, %ebx) | 738 | PER_CPU(gdt_page, %ebx) |
721 | GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) | 739 | mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ |
722 | addl %esp, %eax | 740 | mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ |
741 | shl $16, %eax | ||
742 | addl %esp, %eax /* the adjusted stack pointer */ | ||
723 | pushl $__KERNEL_DS | 743 | pushl $__KERNEL_DS |
724 | CFI_ADJUST_CFA_OFFSET 4 | 744 | CFI_ADJUST_CFA_OFFSET 4 |
725 | pushl %eax | 745 | pushl %eax |
726 | CFI_ADJUST_CFA_OFFSET 4 | 746 | CFI_ADJUST_CFA_OFFSET 4 |
727 | lss (%esp), %esp | 747 | lss (%esp), %esp /* switch to the normal stack segment */ |
728 | CFI_ADJUST_CFA_OFFSET -8 | 748 | CFI_ADJUST_CFA_OFFSET -8 |
729 | .endm | 749 | .endm |
730 | .macro UNWIND_ESPFIX_STACK | 750 | .macro UNWIND_ESPFIX_STACK |
@@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller) | |||
1154 | pushl %edx | 1174 | pushl %edx |
1155 | movl 0xc(%esp), %edx | 1175 | movl 0xc(%esp), %edx |
1156 | lea 0x4(%ebp), %eax | 1176 | lea 0x4(%ebp), %eax |
1177 | movl (%ebp), %ecx | ||
1157 | subl $MCOUNT_INSN_SIZE, %edx | 1178 | subl $MCOUNT_INSN_SIZE, %edx |
1158 | call prepare_ftrace_return | 1179 | call prepare_ftrace_return |
1159 | popl %edx | 1180 | popl %edx |
@@ -1168,6 +1189,7 @@ return_to_handler: | |||
1168 | pushl %eax | 1189 | pushl %eax |
1169 | pushl %ecx | 1190 | pushl %ecx |
1170 | pushl %edx | 1191 | pushl %edx |
1192 | movl %ebp, %eax | ||
1171 | call ftrace_return_to_handler | 1193 | call ftrace_return_to_handler |
1172 | movl %eax, 0xc(%esp) | 1194 | movl %eax, 0xc(%esp) |
1173 | popl %edx | 1195 | popl %edx |
@@ -1329,7 +1351,7 @@ nmi_stack_correct: | |||
1329 | xorl %edx,%edx # zero error code | 1351 | xorl %edx,%edx # zero error code |
1330 | movl %esp,%eax # pt_regs pointer | 1352 | movl %esp,%eax # pt_regs pointer |
1331 | call do_nmi | 1353 | call do_nmi |
1332 | jmp restore_nocheck_notrace | 1354 | jmp restore_all_notrace |
1333 | CFI_ENDPROC | 1355 | CFI_ENDPROC |
1334 | 1356 | ||
1335 | nmi_stack_fixup: | 1357 | nmi_stack_fixup: |
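The rewritten ldt_ss path replaces the patch_espfix_desc() call with inline GDT surgery. The underlying problem: iret restores only the low 16 bits of ESP when returning to a 16-bit stack segment, so the kernel parks the missing high word in the base of a per-CPU ESPFIX segment. The new code builds an ESP whose high word already matches userspace, then stuffs the compensating offset into base bits 16..23 and 24..31 of the GDT entry. A user-space sketch of that arithmetic, with hypothetical stack values (the real code edits the per-CPU GDT and then does lss):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t kernel_esp = 0xc1357bd0;  /* hypothetical kernel stack ptr */
	uint32_t user_esp   = 0xbfff1234;  /* value iret must reproduce */

	/* new ESP: userspace high word, kernel stack low word */
	uint32_t new_esp = (user_esp & 0xffff0000) | (kernel_esp & 0xffff);
	uint32_t offset  = kernel_esp - new_esp;   /* low word is zero */

	/* offset bytes 2 and 3 become GDT base bits 16..23 and 24..31 */
	uint8_t base_23_16 = (offset >> 16) & 0xff;
	uint8_t base_31_24 = (offset >> 24) & 0xff;

	/* check: segment base + new ESP addresses the kernel stack again */
	uint32_t base = ((uint32_t)base_31_24 << 24) |
			((uint32_t)base_23_16 << 16);
	printf("base=%#x base+new_esp=%#x kernel_esp=%#x\n",
	       (unsigned)base, (unsigned)(base + new_esp),
	       (unsigned)kernel_esp);
	return 0;
}

Since the low word of ESP is untouched and the high word is already the user's, the truncated iret restore is harmless; FIXUP_ESPFIX_STACK reverses the trick by reading the same two base bytes back out of the GDT.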
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index de74f0a3e0ed..c251be745107 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller) | |||
135 | 135 | ||
136 | leaq 8(%rbp), %rdi | 136 | leaq 8(%rbp), %rdi |
137 | movq 0x38(%rsp), %rsi | 137 | movq 0x38(%rsp), %rsi |
138 | movq (%rbp), %rdx | ||
138 | subq $MCOUNT_INSN_SIZE, %rsi | 139 | subq $MCOUNT_INSN_SIZE, %rsi |
139 | 140 | ||
140 | call prepare_ftrace_return | 141 | call prepare_ftrace_return |
@@ -150,6 +151,7 @@ GLOBAL(return_to_handler) | |||
150 | /* Save the return values */ | 151 | /* Save the return values */ |
151 | movq %rax, (%rsp) | 152 | movq %rax, (%rsp) |
152 | movq %rdx, 8(%rsp) | 153 | movq %rdx, 8(%rsp) |
154 | movq %rbp, %rdi | ||
153 | 155 | ||
154 | call ftrace_return_to_handler | 156 | call ftrace_return_to_handler |
155 | 157 | ||
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b79c5533c421..9dbb527e1652 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c | |||
@@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void) | |||
408 | * Hook the return address and push it in the stack of return addrs | 408 | * Hook the return address and push it in the stack of return addrs |
409 | * in current thread info. | 409 | * in current thread info. |
410 | */ | 410 | */ |
411 | void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) | 411 | void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, |
412 | unsigned long frame_pointer) | ||
412 | { | 413 | { |
413 | unsigned long old; | 414 | unsigned long old; |
414 | int faulted; | 415 | int faulted; |
@@ -416,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) | |||
416 | unsigned long return_hooker = (unsigned long) | 417 | unsigned long return_hooker = (unsigned long) |
417 | &return_to_handler; | 418 | &return_to_handler; |
418 | 419 | ||
419 | /* Nmi's are currently unsupported */ | ||
420 | if (unlikely(in_nmi())) | ||
421 | return; | ||
422 | |||
423 | if (unlikely(atomic_read(¤t->tracing_graph_pause))) | 420 | if (unlikely(atomic_read(¤t->tracing_graph_pause))) |
424 | return; | 421 | return; |
425 | 422 | ||
@@ -453,7 +450,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) | |||
453 | return; | 450 | return; |
454 | } | 451 | } |
455 | 452 | ||
456 | if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { | 453 | if (ftrace_push_return_trace(old, self_addr, &trace.depth, |
454 | frame_pointer) == -EBUSY) { | ||
457 | *parent = old; | 455 | *parent = old; |
458 | return; | 456 | return; |
459 | } | 457 | } |
@@ -496,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) | |||
496 | 494 | ||
497 | struct syscall_metadata *syscall_nr_to_meta(int nr) | 495 | struct syscall_metadata *syscall_nr_to_meta(int nr) |
498 | { | 496 | { |
499 | if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) | 497 | if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) |
500 | return NULL; | 498 | return NULL; |
501 | 499 | ||
502 | return syscalls_metadata[nr]; | 500 | return syscalls_metadata[nr]; |
503 | } | 501 | } |
504 | 502 | ||
505 | void arch_init_ftrace_syscalls(void) | 503 | int syscall_name_to_nr(char *name) |
504 | { | ||
505 | int i; | ||
506 | |||
507 | if (!syscalls_metadata) | ||
508 | return -1; | ||
509 | |||
510 | for (i = 0; i < NR_syscalls; i++) { | ||
511 | if (syscalls_metadata[i]) { | ||
512 | if (!strcmp(syscalls_metadata[i]->name, name)) | ||
513 | return i; | ||
514 | } | ||
515 | } | ||
516 | return -1; | ||
517 | } | ||
518 | |||
519 | void set_syscall_enter_id(int num, int id) | ||
520 | { | ||
521 | syscalls_metadata[num]->enter_id = id; | ||
522 | } | ||
523 | |||
524 | void set_syscall_exit_id(int num, int id) | ||
525 | { | ||
526 | syscalls_metadata[num]->exit_id = id; | ||
527 | } | ||
528 | |||
529 | static int __init arch_init_ftrace_syscalls(void) | ||
506 | { | 530 | { |
507 | int i; | 531 | int i; |
508 | struct syscall_metadata *meta; | 532 | struct syscall_metadata *meta; |
509 | unsigned long **psys_syscall_table = &sys_call_table; | 533 | unsigned long **psys_syscall_table = &sys_call_table; |
510 | static atomic_t refs; | ||
511 | |||
512 | if (atomic_inc_return(&refs) != 1) | ||
513 | goto end; | ||
514 | 534 | ||
515 | syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * | 535 | syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * |
516 | FTRACE_SYSCALL_MAX, GFP_KERNEL); | 536 | NR_syscalls, GFP_KERNEL); |
517 | if (!syscalls_metadata) { | 537 | if (!syscalls_metadata) { |
518 | WARN_ON(1); | 538 | WARN_ON(1); |
519 | return; | 539 | return -ENOMEM; |
520 | } | 540 | } |
521 | 541 | ||
522 | for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { | 542 | for (i = 0; i < NR_syscalls; i++) { |
523 | meta = find_syscall_meta(psys_syscall_table[i]); | 543 | meta = find_syscall_meta(psys_syscall_table[i]); |
524 | syscalls_metadata[i] = meta; | 544 | syscalls_metadata[i] = meta; |
525 | } | 545 | } |
526 | return; | 546 | return 0; |
527 | |||
528 | /* Paranoid: avoid overflow */ | ||
529 | end: | ||
530 | atomic_dec(&refs); | ||
531 | } | 547 | } |
548 | arch_initcall(arch_init_ftrace_syscalls); | ||
532 | #endif | 549 | #endif |
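syscall_name_to_nr() above is a plain linear scan over the per-syscall metadata table; that is acceptable here because it runs only on tracepoint registration paths, not per syscall. A standalone sketch of the lookup, with an invented two-entry table (the kernel builds the real one from sys_call_table in the arch_initcall):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct syscall_metadata {
	const char *name;
	int enter_id, exit_id;
};

static struct syscall_metadata read_meta  = { "sys_read",  0, 0 };
static struct syscall_metadata write_meta = { "sys_write", 0, 0 };
static struct syscall_metadata *syscalls_metadata[] = {
	[0] = &read_meta,
	[1] = &write_meta,
	[2] = NULL,               /* slot with no metadata */
};
#define NR_SYSCALLS (sizeof(syscalls_metadata) / sizeof(syscalls_metadata[0]))

static int syscall_name_to_nr(const char *name)
{
	size_t i;

	for (i = 0; i < NR_SYSCALLS; i++)
		if (syscalls_metadata[i] &&
		    !strcmp(syscalls_metadata[i]->name, name))
			return (int)i;
	return -1;                /* unknown or metadata-less syscall */
}

int main(void)
{
	printf("%d %d\n", syscall_name_to_nr("sys_write"),
	       syscall_name_to_nr("sys_fork"));
	return 0;
}

Note also that moving the initialization into an arch_initcall removes the racy atomic refcount that previously guarded against multiple callers.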
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index dc5ed4bdd88d..cc827ac9e8d3 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <asm/segment.h> | 13 | #include <asm/segment.h> |
14 | #include <asm/page_types.h> | 14 | #include <asm/page_types.h> |
15 | #include <asm/pgtable_types.h> | 15 | #include <asm/pgtable_types.h> |
16 | #include <asm/desc.h> | ||
17 | #include <asm/cache.h> | 16 | #include <asm/cache.h> |
18 | #include <asm/thread_info.h> | 17 | #include <asm/thread_info.h> |
19 | #include <asm/asm-offsets.h> | 18 | #include <asm/asm-offsets.h> |
@@ -262,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
262 | * which will be freed later | 261 | * which will be freed later |
263 | */ | 262 | */ |
264 | 263 | ||
265 | #ifndef CONFIG_HOTPLUG_CPU | 264 | __CPUINIT |
266 | .section .init.text,"ax",@progbits | ||
267 | #endif | ||
268 | 265 | ||
269 | #ifdef CONFIG_SMP | 266 | #ifdef CONFIG_SMP |
270 | ENTRY(startup_32_smp) | 267 | ENTRY(startup_32_smp) |
@@ -603,7 +600,7 @@ ignore_int: | |||
603 | #endif | 600 | #endif |
604 | iret | 601 | iret |
605 | 602 | ||
606 | .section .cpuinit.data,"wa" | 603 | __REFDATA |
607 | .align 4 | 604 | .align 4 |
608 | ENTRY(initial_code) | 605 | ENTRY(initial_code) |
609 | .long i386_start_kernel | 606 | .long i386_start_kernel |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 54b29bb24e71..fa54f78e2a05 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -12,7 +12,6 @@ | |||
12 | #include <linux/linkage.h> | 12 | #include <linux/linkage.h> |
13 | #include <linux/threads.h> | 13 | #include <linux/threads.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <asm/desc.h> | ||
16 | #include <asm/segment.h> | 15 | #include <asm/segment.h> |
17 | #include <asm/pgtable.h> | 16 | #include <asm/pgtable.h> |
18 | #include <asm/page.h> | 17 | #include <asm/page.h> |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 81408b93f887..dedc2bddf7a5 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev) | |||
510 | { | 510 | { |
511 | 511 | ||
512 | if (request_irq(dev->irq, hpet_interrupt_handler, | 512 | if (request_irq(dev->irq, hpet_interrupt_handler, |
513 | IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) | 513 | IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, |
514 | dev->name, dev)) | ||
514 | return -1; | 515 | return -1; |
515 | 516 | ||
516 | disable_irq(dev->irq); | 517 | disable_irq(dev->irq); |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2d..92b7703d3d58 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void) | |||
187 | #ifdef CONFIG_X86_THERMAL_VECTOR | 187 | #ifdef CONFIG_X86_THERMAL_VECTOR |
188 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | 188 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); |
189 | #endif | 189 | #endif |
190 | #ifdef CONFIG_X86_THRESHOLD | 190 | #ifdef CONFIG_X86_MCE_THRESHOLD |
191 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | 191 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); |
192 | #endif | 192 | #endif |
193 | #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) | 193 | #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a78ecad0c900..c664d515f613 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void) | |||
200 | state->mode = paravirt_get_lazy_mode(); | 200 | state->mode = paravirt_get_lazy_mode(); |
201 | } | 201 | } |
202 | 202 | ||
203 | static void paravirt_ops_setup(void) | 203 | static void __init paravirt_ops_setup(void) |
204 | { | 204 | { |
205 | pv_info.name = "KVM"; | 205 | pv_info.name = "KVM"; |
206 | pv_info.paravirt_enabled = 1; | 206 | pv_info.paravirt_enabled = 1; |
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 846510b78a09..2a62d843f015 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c | |||
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id) | |||
347 | 347 | ||
348 | static struct irqaction mfgptirq = { | 348 | static struct irqaction mfgptirq = { |
349 | .handler = mfgpt_tick, | 349 | .handler = mfgpt_tick, |
350 | .flags = IRQF_DISABLED | IRQF_NOBALANCING, | 350 | .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, |
351 | .name = "mfgpt-timer" | 351 | .name = "mfgpt-timer" |
352 | }; | 352 | }; |
353 | 353 | ||
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 745579bc8256..1a041bcf506b 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -32,6 +32,8 @@ int no_iommu __read_mostly; | |||
32 | /* Set this to 1 if there is a HW IOMMU in the system */ | 32 | /* Set this to 1 if there is a HW IOMMU in the system */ |
33 | int iommu_detected __read_mostly = 0; | 33 | int iommu_detected __read_mostly = 0; |
34 | 34 | ||
35 | int iommu_pass_through; | ||
36 | |||
35 | dma_addr_t bad_dma_address __read_mostly = 0; | 37 | dma_addr_t bad_dma_address __read_mostly = 0; |
36 | EXPORT_SYMBOL(bad_dma_address); | 38 | EXPORT_SYMBOL(bad_dma_address); |
37 | 39 | ||
@@ -210,6 +212,10 @@ static __init int iommu_setup(char *p) | |||
210 | if (!strncmp(p, "soft", 4)) | 212 | if (!strncmp(p, "soft", 4)) |
211 | swiotlb = 1; | 213 | swiotlb = 1; |
212 | #endif | 214 | #endif |
215 | if (!strncmp(p, "pt", 2)) { | ||
216 | iommu_pass_through = 1; | ||
217 | return 1; | ||
218 | } | ||
213 | 219 | ||
214 | gart_parse_options(p); | 220 | gart_parse_options(p); |
215 | 221 | ||
@@ -290,6 +296,8 @@ static int __init pci_iommu_init(void) | |||
290 | void pci_iommu_shutdown(void) | 296 | void pci_iommu_shutdown(void) |
291 | { | 297 | { |
292 | gart_iommu_shutdown(); | 298 | gart_iommu_shutdown(); |
299 | |||
300 | amd_iommu_shutdown(); | ||
293 | } | 301 | } |
294 | /* Must execute after PCI subsystem */ | 302 | /* Must execute after PCI subsystem */ |
295 | fs_initcall(pci_iommu_init); | 303 | fs_initcall(pci_iommu_init); |
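The new iommu_pass_through flag is set from the kernel command line: booting with iommu=pt asks a hardware IOMMU to run in 1:1 pass-through mode, and (per the pci-swiotlb.c hunk below) forces swiotlb on as a fallback for devices that still need bounce buffering. The option matching in iommu_setup() is the usual strncmp idiom; a standalone sketch of just the parsing (the option names are real, the harness is not kernel code):

#include <stdio.h>
#include <string.h>

static int no_iommu;
static int swiotlb;
static int iommu_pass_through;

static void parse_iommu_opt(const char *p)
{
	if (!strncmp(p, "off", 3))
		no_iommu = 1;
	else if (!strncmp(p, "soft", 4))
		swiotlb = 1;
	else if (!strncmp(p, "pt", 2))
		iommu_pass_through = 1; /* hardware IOMMU, identity mapped */
}

int main(void)
{
	parse_iommu_opt("pt");          /* as in booting with iommu=pt */
	printf("pass_through=%d soft=%d off=%d\n",
	       iommu_pass_through, swiotlb, no_iommu);
	return 0;
}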
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index cfd9f9063896..d2e56b8f48e7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
675 | nommu: | 675 | nommu: |
676 | /* Should not happen anymore */ | 676 | /* Should not happen anymore */ |
677 | printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" | 677 | printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" |
678 | KERN_WARNING "falling back to iommu=soft.\n"); | 678 | "falling back to iommu=soft.\n"); |
679 | return -1; | 679 | return -1; |
680 | } | 680 | } |
681 | 681 | ||
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index a1712f2b50f1..6af96ee44200 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c | |||
@@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void) | |||
71 | { | 71 | { |
72 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ | 72 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ |
73 | #ifdef CONFIG_X86_64 | 73 | #ifdef CONFIG_X86_64 |
74 | if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) | 74 | if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || |
75 | iommu_pass_through) | ||
75 | swiotlb = 1; | 76 | swiotlb = 1; |
76 | #endif | 77 | #endif |
77 | if (swiotlb_force) | 78 | if (swiotlb_force) |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fc6e4b773fc4..1092a1a2fbe6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -509,16 +509,12 @@ static void c1e_idle(void) | |||
509 | if (!cpumask_test_cpu(cpu, c1e_mask)) { | 509 | if (!cpumask_test_cpu(cpu, c1e_mask)) { |
510 | cpumask_set_cpu(cpu, c1e_mask); | 510 | cpumask_set_cpu(cpu, c1e_mask); |
511 | /* | 511 | /* |
512 | * Force broadcast so ACPI can not interfere. Needs | 512 | * Force broadcast so ACPI can not interfere. |
513 | * to run with interrupts enabled as it uses | ||
514 | * smp_function_call. | ||
515 | */ | 513 | */ |
516 | local_irq_enable(); | ||
517 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, | 514 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, |
518 | &cpu); | 515 | &cpu); |
519 | printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", | 516 | printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", |
520 | cpu); | 517 | cpu); |
521 | local_irq_disable(); | ||
522 | } | 518 | } |
523 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | 519 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); |
524 | 520 | ||
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index cabdabce3cb2..113b8927c822 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -36,10 +36,11 @@ | |||
36 | #include <asm/ds.h> | 36 | #include <asm/ds.h> |
37 | #include <asm/hw_breakpoint.h> | 37 | #include <asm/hw_breakpoint.h> |
38 | 38 | ||
39 | #include <trace/syscall.h> | ||
40 | |||
41 | #include "tls.h" | 39 | #include "tls.h" |
42 | 40 | ||
41 | #define CREATE_TRACE_POINTS | ||
42 | #include <trace/events/syscalls.h> | ||
43 | |||
43 | enum x86_regset { | 44 | enum x86_regset { |
44 | REGSET_GENERAL, | 45 | REGSET_GENERAL, |
45 | REGSET_FP, | 46 | REGSET_FP, |
@@ -1548,8 +1549,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) | |||
1548 | tracehook_report_syscall_entry(regs)) | 1549 | tracehook_report_syscall_entry(regs)) |
1549 | ret = -1L; | 1550 | ret = -1L; |
1550 | 1551 | ||
1551 | if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) | 1552 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1552 | ftrace_syscall_enter(regs); | 1553 | trace_sys_enter(regs, regs->orig_ax); |
1553 | 1554 | ||
1554 | if (unlikely(current->audit_context)) { | 1555 | if (unlikely(current->audit_context)) { |
1555 | if (IS_IA32) | 1556 | if (IS_IA32) |
@@ -1574,8 +1575,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) | |||
1574 | if (unlikely(current->audit_context)) | 1575 | if (unlikely(current->audit_context)) |
1575 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); | 1576 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); |
1576 | 1577 | ||
1577 | if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) | 1578 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1578 | ftrace_syscall_exit(regs); | 1579 | trace_sys_exit(regs, regs->ax); |
1579 | 1580 | ||
1580 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | 1581 | if (test_thread_flag(TIF_SYSCALL_TRACE)) |
1581 | tracehook_report_syscall_exit(regs, 0); | 1582 | tracehook_report_syscall_exit(regs, 0); |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 4f9c55f3a7c0..03801f2f761f 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | |||
60 | "adc %5,%%edx ; " | 60 | "adc %5,%%edx ; " |
61 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | 61 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) |
62 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | 62 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); |
63 | #elif __x86_64__ | 63 | #elif defined(__x86_64__) |
64 | __asm__ ( | 64 | __asm__ ( |
65 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | 65 | "mul %%rdx ; shrd $32,%%rdx,%%rax" |
66 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | 66 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d1ce8170f0..a06e8d101844 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -3,6 +3,7 @@ | |||
3 | #include <linux/init.h> | 3 | #include <linux/init.h> |
4 | #include <linux/pm.h> | 4 | #include <linux/pm.h> |
5 | #include <linux/efi.h> | 5 | #include <linux/efi.h> |
6 | #include <linux/dmi.h> | ||
6 | #include <acpi/reboot.h> | 7 | #include <acpi/reboot.h> |
7 | #include <asm/io.h> | 8 | #include <asm/io.h> |
8 | #include <asm/apic.h> | 9 | #include <asm/apic.h> |
@@ -17,7 +18,6 @@ | |||
17 | #include <asm/cpu.h> | 18 | #include <asm/cpu.h> |
18 | 19 | ||
19 | #ifdef CONFIG_X86_32 | 20 | #ifdef CONFIG_X86_32 |
20 | # include <linux/dmi.h> | ||
21 | # include <linux/ctype.h> | 21 | # include <linux/ctype.h> |
22 | # include <linux/mc146818rtc.h> | 22 | # include <linux/mc146818rtc.h> |
23 | #else | 23 | #else |
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
249 | DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), | 249 | DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), |
250 | }, | 250 | }, |
251 | }, | 251 | }, |
252 | { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */ | ||
253 | .callback = set_bios_reboot, | ||
254 | .ident = "CompuLab SBC-FITPC2", | ||
255 | .matches = { | ||
256 | DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"), | ||
257 | DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), | ||
258 | }, | ||
259 | }, | ||
252 | { } | 260 | { } |
253 | }; | 261 | }; |
254 | 262 | ||
@@ -396,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart); | |||
396 | 404 | ||
397 | #endif /* CONFIG_X86_32 */ | 405 | #endif /* CONFIG_X86_32 */ |
398 | 406 | ||
407 | /* | ||
408 | * Some Apple MacBook and MacBookPro models need reboot=p to be able to reboot | ||
409 | */ | ||
410 | static int __init set_pci_reboot(const struct dmi_system_id *d) | ||
411 | { | ||
412 | if (reboot_type != BOOT_CF9) { | ||
413 | reboot_type = BOOT_CF9; | ||
414 | printk(KERN_INFO "%s series board detected. " | ||
415 | "Selecting PCI-method for reboots.\n", d->ident); | ||
416 | } | ||
417 | return 0; | ||
418 | } | ||
419 | |||
420 | static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { | ||
421 | { /* Handle problems with rebooting on Apple MacBook5 */ | ||
422 | .callback = set_pci_reboot, | ||
423 | .ident = "Apple MacBook5", | ||
424 | .matches = { | ||
425 | DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), | ||
426 | DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), | ||
427 | }, | ||
428 | }, | ||
429 | { /* Handle problems with rebooting on Apple MacBookPro5 */ | ||
430 | .callback = set_pci_reboot, | ||
431 | .ident = "Apple MacBookPro5", | ||
432 | .matches = { | ||
433 | DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), | ||
434 | DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), | ||
435 | }, | ||
436 | }, | ||
437 | { } | ||
438 | }; | ||
439 | |||
440 | static int __init pci_reboot_init(void) | ||
441 | { | ||
442 | dmi_check_system(pci_reboot_dmi_table); | ||
443 | return 0; | ||
444 | } | ||
445 | core_initcall(pci_reboot_init); | ||
446 | |||
399 | static inline void kb_wait(void) | 447 | static inline void kb_wait(void) |
400 | { | 448 | { |
401 | int i; | 449 | int i; |
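pci_reboot_dmi_table follows the standard DMI-quirk pattern: a table of vendor/product matches plus a callback that switches the reboot method, walked once from an initcall. DMI_MATCH is a substring match, which is why "MacBookPro5" also covers "MacBookPro5,3" and friends. A stripped-down model of the matching, with a simplified two-field table instead of the real dmi_system_id:

#include <stdio.h>
#include <string.h>

struct dmi_id {
	int (*callback)(const struct dmi_id *d);
	const char *ident;
	const char *vendor, *product;   /* NULL means "don't care" */
};

static int reboot_type;                 /* 0 = default, 1 = PCI (CF9) */

static int set_pci_reboot(const struct dmi_id *d)
{
	reboot_type = 1;
	printf("%s detected, using the PCI reboot method\n", d->ident);
	return 0;
}

static const struct dmi_id table[] = {
	{ set_pci_reboot, "Apple MacBook5",    "Apple Inc.", "MacBook5" },
	{ set_pci_reboot, "Apple MacBookPro5", "Apple Inc.", "MacBookPro5" },
	{ 0 }
};

static void check_system(const char *vendor, const char *product)
{
	const struct dmi_id *d;

	for (d = table; d->callback; d++)
		if ((!d->vendor || strstr(vendor, d->vendor)) &&
		    (!d->product || strstr(product, d->product)))
			d->callback(d);
}

int main(void)
{
	check_system("Apple Inc.", "MacBookPro5,3");    /* substring match */
	return 0;
}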
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index be5ae80f897f..63f32d220ef2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align) | |||
289 | return ret; | 289 | return ret; |
290 | } | 290 | } |
291 | 291 | ||
292 | #ifdef CONFIG_X86_64 | ||
293 | static void __init init_gbpages(void) | ||
294 | { | ||
295 | if (direct_gbpages && cpu_has_gbpages) | ||
296 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
297 | else | ||
298 | direct_gbpages = 0; | ||
299 | } | ||
300 | #else | ||
301 | static inline void init_gbpages(void) | ||
302 | { | ||
303 | } | ||
304 | #endif | ||
305 | |||
292 | static void __init reserve_brk(void) | 306 | static void __init reserve_brk(void) |
293 | { | 307 | { |
294 | if (_brk_end > _brk_start) | 308 | if (_brk_end > _brk_start) |
@@ -658,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { | |||
658 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), | 672 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), |
659 | }, | 673 | }, |
660 | }, | 674 | }, |
675 | { | ||
676 | /* | ||
677 | * An AMI BIOS with low memory corruption was found on the Intel DG45ID board. | ||
678 | * It has a different DMI_BIOS_VENDOR = "Intel Corp.", so for now we | ||
679 | * match only DMI_BOARD_NAME and see if there are more bad products | ||
680 | * with this vendor. | ||
681 | */ | ||
682 | .callback = dmi_low_memory_corruption, | ||
683 | .ident = "AMI BIOS", | ||
684 | .matches = { | ||
685 | DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), | ||
686 | }, | ||
687 | }, | ||
661 | #endif | 688 | #endif |
662 | {} | 689 | {} |
663 | }; | 690 | }; |
@@ -871,6 +898,8 @@ void __init setup_arch(char **cmdline_p) | |||
871 | 898 | ||
872 | reserve_brk(); | 899 | reserve_brk(); |
873 | 900 | ||
901 | init_gbpages(); | ||
902 | |||
874 | /* max_pfn_mapped is updated here */ | 903 | /* max_pfn_mapped is updated here */ |
875 | max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); | 904 | max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); |
876 | max_pfn_mapped = max_low_pfn_mapped; | 905 | max_pfn_mapped = max_low_pfn_mapped; |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9c3f0823e6aa..07d81916f212 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, | |||
124 | } | 124 | } |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * Remap allocator | 127 | * Large page remap allocator |
128 | * | 128 | * |
129 | * This allocator uses PMD page as unit. A PMD page is allocated for | 129 | * This allocator uses PMD page as unit. A PMD page is allocated for |
130 | * each cpu and each is remapped into vmalloc area using PMD mapping. | 130 | * each cpu and each is remapped into vmalloc area using PMD mapping. |
@@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, | |||
137 | * better than only using 4k mappings while still being NUMA friendly. | 137 | * better than only using 4k mappings while still being NUMA friendly. |
138 | */ | 138 | */ |
139 | #ifdef CONFIG_NEED_MULTIPLE_NODES | 139 | #ifdef CONFIG_NEED_MULTIPLE_NODES |
140 | static size_t pcpur_size __initdata; | 140 | struct pcpul_ent { |
141 | static void **pcpur_ptrs __initdata; | 141 | unsigned int cpu; |
142 | void *ptr; | ||
143 | }; | ||
144 | |||
145 | static size_t pcpul_size; | ||
146 | static struct pcpul_ent *pcpul_map; | ||
147 | static struct vm_struct pcpul_vm; | ||
142 | 148 | ||
143 | static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) | 149 | static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) |
144 | { | 150 | { |
145 | size_t off = (size_t)pageno << PAGE_SHIFT; | 151 | size_t off = (size_t)pageno << PAGE_SHIFT; |
146 | 152 | ||
147 | if (off >= pcpur_size) | 153 | if (off >= pcpul_size) |
148 | return NULL; | 154 | return NULL; |
149 | 155 | ||
150 | return virt_to_page(pcpur_ptrs[cpu] + off); | 156 | return virt_to_page(pcpul_map[cpu].ptr + off); |
151 | } | 157 | } |
152 | 158 | ||
153 | static ssize_t __init setup_pcpu_remap(size_t static_size) | 159 | static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) |
154 | { | 160 | { |
155 | static struct vm_struct vm; | 161 | size_t map_size, dyn_size; |
156 | size_t ptrs_size, dyn_size; | ||
157 | unsigned int cpu; | 162 | unsigned int cpu; |
163 | int i, j; | ||
158 | ssize_t ret; | 164 | ssize_t ret; |
159 | 165 | ||
160 | /* | 166 | if (!chosen) { |
161 | * If large page isn't supported, there's no benefit in doing | 167 | size_t vm_size = VMALLOC_END - VMALLOC_START; |
162 | * this. Also, on non-NUMA, embedding is better. | 168 | size_t tot_size = nr_cpu_ids * PMD_SIZE; |
163 | * | 169 | |
164 | * NOTE: disabled for now. | 170 | /* on non-NUMA, embedding is better */ |
165 | */ | 171 | if (!pcpu_need_numa()) |
166 | if (true || !cpu_has_pse || !pcpu_need_numa()) | 172 | return -EINVAL; |
173 | |||
174 | /* don't consume more than 20% of vmalloc area */ | ||
175 | if (tot_size > vm_size / 5) { | ||
176 | pr_info("PERCPU: too large chunk size %zuMB for " | ||
177 | "large page remap\n", tot_size >> 20); | ||
178 | return -EINVAL; | ||
179 | } | ||
180 | } | ||
181 | |||
182 | /* need PSE */ | ||
183 | if (!cpu_has_pse) { | ||
184 | pr_warning("PERCPU: lpage allocator requires PSE\n"); | ||
167 | return -EINVAL; | 185 | return -EINVAL; |
186 | } | ||
168 | 187 | ||
169 | /* | 188 | /* |
170 | * Currently supports only single page. Supporting multiple | 189 | * Currently supports only single page. Supporting multiple |
171 | * pages won't be too difficult if it ever becomes necessary. | 190 | * pages won't be too difficult if it ever becomes necessary. |
172 | */ | 191 | */ |
173 | pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + | 192 | pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + |
174 | PERCPU_DYNAMIC_RESERVE); | 193 | PERCPU_DYNAMIC_RESERVE); |
175 | if (pcpur_size > PMD_SIZE) { | 194 | if (pcpul_size > PMD_SIZE) { |
176 | pr_warning("PERCPU: static data is larger than large page, " | 195 | pr_warning("PERCPU: static data is larger than large page, " |
177 | "can't use large page\n"); | 196 | "can't use large page\n"); |
178 | return -EINVAL; | 197 | return -EINVAL; |
179 | } | 198 | } |
180 | dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; | 199 | dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; |
181 | 200 | ||
182 | /* allocate pointer array and alloc large pages */ | 201 | /* allocate pointer array and alloc large pages */ |
183 | ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); | 202 | map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); |
184 | pcpur_ptrs = alloc_bootmem(ptrs_size); | 203 | pcpul_map = alloc_bootmem(map_size); |
185 | 204 | ||
186 | for_each_possible_cpu(cpu) { | 205 | for_each_possible_cpu(cpu) { |
187 | pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); | 206 | pcpul_map[cpu].cpu = cpu; |
188 | if (!pcpur_ptrs[cpu]) | 207 | pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, |
208 | PMD_SIZE); | ||
209 | if (!pcpul_map[cpu].ptr) { | ||
210 | pr_warning("PERCPU: failed to allocate large page " | ||
211 | "for cpu%u\n", cpu); | ||
189 | goto enomem; | 212 | goto enomem; |
213 | } | ||
190 | 214 | ||
191 | /* | 215 | /* |
192 | * Only use pcpur_size bytes and give back the rest. | 216 | * Only use pcpul_size bytes and give back the rest. |
193 | * | 217 | * |
194 | * Ingo: The 2MB up-rounding bootmem is needed to make | 218 | * Ingo: The 2MB up-rounding bootmem is needed to make |
195 | * sure the partial 2MB page is still fully RAM - it's | 219 | * sure the partial 2MB page is still fully RAM - it's |
196 | * not well-specified to have a PAT-incompatible area | 220 | * not well-specified to have a PAT-incompatible area |
197 | * (unmapped RAM, device memory, etc.) in that hole. | 221 | * (unmapped RAM, device memory, etc.) in that hole. |
198 | */ | 222 | */ |
199 | free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), | 223 | free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), |
200 | PMD_SIZE - pcpur_size); | 224 | PMD_SIZE - pcpul_size); |
201 | 225 | ||
202 | memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); | 226 | memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); |
203 | } | 227 | } |
204 | 228 | ||
205 | /* allocate address and map */ | 229 | /* allocate address and map */ |
206 | vm.flags = VM_ALLOC; | 230 | pcpul_vm.flags = VM_ALLOC; |
207 | vm.size = num_possible_cpus() * PMD_SIZE; | 231 | pcpul_vm.size = nr_cpu_ids * PMD_SIZE; |
208 | vm_area_register_early(&vm, PMD_SIZE); | 232 | vm_area_register_early(&pcpul_vm, PMD_SIZE); |
209 | 233 | ||
210 | for_each_possible_cpu(cpu) { | 234 | for_each_possible_cpu(cpu) { |
211 | pmd_t *pmd; | 235 | pmd_t *pmd, pmd_v; |
212 | 236 | ||
213 | pmd = populate_extra_pmd((unsigned long)vm.addr | 237 | pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + |
214 | + cpu * PMD_SIZE); | 238 | cpu * PMD_SIZE); |
215 | set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), | 239 | pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), |
216 | PAGE_KERNEL_LARGE)); | 240 | PAGE_KERNEL_LARGE); |
241 | set_pmd(pmd, pmd_v); | ||
217 | } | 242 | } |
218 | 243 | ||
219 | /* we're ready, commit */ | 244 | /* we're ready, commit */ |
220 | pr_info("PERCPU: Remapped at %p with large pages, static data " | 245 | pr_info("PERCPU: Remapped at %p with large pages, static data " |
221 | "%zu bytes\n", vm.addr, static_size); | 246 | "%zu bytes\n", pcpul_vm.addr, static_size); |
222 | 247 | ||
223 | ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, | 248 | ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, |
224 | PERCPU_FIRST_CHUNK_RESERVE, dyn_size, | 249 | PERCPU_FIRST_CHUNK_RESERVE, dyn_size, |
225 | PMD_SIZE, vm.addr, NULL); | 250 | PMD_SIZE, pcpul_vm.addr, NULL); |
226 | goto out_free_ar; | 251 | |
252 | /* sort pcpul_map array for pcpu_lpage_remapped() */ | ||
253 | for (i = 0; i < nr_cpu_ids - 1; i++) | ||
254 | for (j = i + 1; j < nr_cpu_ids; j++) | ||
255 | if (pcpul_map[i].ptr > pcpul_map[j].ptr) { | ||
256 | struct pcpul_ent tmp = pcpul_map[i]; | ||
257 | pcpul_map[i] = pcpul_map[j]; | ||
258 | pcpul_map[j] = tmp; | ||
259 | } | ||
260 | |||
261 | return ret; | ||
227 | 262 | ||
228 | enomem: | 263 | enomem: |
229 | for_each_possible_cpu(cpu) | 264 | for_each_possible_cpu(cpu) |
230 | if (pcpur_ptrs[cpu]) | 265 | if (pcpul_map[cpu].ptr) |
231 | free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); | 266 | free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); |
232 | ret = -ENOMEM; | 267 | free_bootmem(__pa(pcpul_map), map_size); |
233 | out_free_ar: | 268 | return -ENOMEM; |
234 | free_bootmem(__pa(pcpur_ptrs), ptrs_size); | 269 | } |
235 | return ret; | 270 | |
271 | /** | ||
272 | * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area | ||
273 | * @kaddr: the kernel address in question | ||
274 | * | ||
275 | * Determine whether @kaddr falls in the pcpul recycled area. This is | ||
276 | * used by pageattr to detect VM aliases and break up the pcpu PMD | ||
277 | * mapping such that the same physical page is not mapped under | ||
278 | * different attributes. | ||
279 | * | ||
280 | * The recycled area is always at the tail of a partially used PMD | ||
281 | * page. | ||
282 | * | ||
283 | * RETURNS: | ||
284 | * Address of corresponding remapped pcpu address if match is found; | ||
285 | * otherwise, NULL. | ||
286 | */ | ||
287 | void *pcpu_lpage_remapped(void *kaddr) | ||
288 | { | ||
289 | void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); | ||
290 | unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; | ||
291 | int left = 0, right = nr_cpu_ids - 1; | ||
292 | int pos; | ||
293 | |||
294 | /* pcpul in use at all? */ | ||
295 | if (!pcpul_map) | ||
296 | return NULL; | ||
297 | |||
298 | /* okay, perform binary search */ | ||
299 | while (left <= right) { | ||
300 | pos = (left + right) / 2; | ||
301 | |||
302 | if (pcpul_map[pos].ptr < pmd_addr) | ||
303 | left = pos + 1; | ||
304 | else if (pcpul_map[pos].ptr > pmd_addr) | ||
305 | right = pos - 1; | ||
306 | else { | ||
307 | /* it shouldn't be in the area for the first chunk */ | ||
308 | WARN_ON(offset < pcpul_size); | ||
309 | |||
310 | return pcpul_vm.addr + | ||
311 | pcpul_map[pos].cpu * PMD_SIZE + offset; | ||
312 | } | ||
313 | } | ||
314 | |||
315 | return NULL; | ||
236 | } | 316 | } |
237 | #else | 317 | #else |
238 | static ssize_t __init setup_pcpu_remap(size_t static_size) | 318 | static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) |
239 | { | 319 | { |
240 | return -EINVAL; | 320 | return -EINVAL; |
241 | } | 321 | } |
@@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) | |||
249 | * mapping so that it can use PMD mapping without additional TLB | 329 | * mapping so that it can use PMD mapping without additional TLB |
250 | * pressure. | 330 | * pressure. |
251 | */ | 331 | */ |
252 | static ssize_t __init setup_pcpu_embed(size_t static_size) | 332 | static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) |
253 | { | 333 | { |
254 | size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; | 334 | size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; |
255 | 335 | ||
@@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) | |||
258 | * this. Also, embedding allocation doesn't play well with | 338 | * this. Also, embedding allocation doesn't play well with |
259 | * NUMA. | 339 | * NUMA. |
260 | */ | 340 | */ |
261 | if (!cpu_has_pse || pcpu_need_numa()) | 341 | if (!chosen && (!cpu_has_pse || pcpu_need_numa())) |
262 | return -EINVAL; | 342 | return -EINVAL; |
263 | 343 | ||
264 | return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, | 344 | return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, |
@@ -297,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) | |||
297 | pcpu4k_nr_static_pages = PFN_UP(static_size); | 377 | pcpu4k_nr_static_pages = PFN_UP(static_size); |
298 | 378 | ||
299 | /* unaligned allocations can't be freed, round up to page size */ | 379 | /* unaligned allocations can't be freed, round up to page size */ |
300 | pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() | 380 | pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids |
301 | * sizeof(pcpu4k_pages[0])); | 381 | * sizeof(pcpu4k_pages[0])); |
302 | pcpu4k_pages = alloc_bootmem(pages_size); | 382 | pcpu4k_pages = alloc_bootmem(pages_size); |
303 | 383 | ||
@@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) | |||
308 | void *ptr; | 388 | void *ptr; |
309 | 389 | ||
310 | ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); | 390 | ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); |
311 | if (!ptr) | 391 | if (!ptr) { |
392 | pr_warning("PERCPU: failed to allocate " | ||
393 | "4k page for cpu%u\n", cpu); | ||
312 | goto enomem; | 394 | goto enomem; |
395 | } | ||
313 | 396 | ||
314 | memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); | 397 | memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); |
315 | pcpu4k_pages[j++] = virt_to_page(ptr); | 398 | pcpu4k_pages[j++] = virt_to_page(ptr); |
@@ -333,6 +416,16 @@ out_free_ar: | |||
333 | return ret; | 416 | return ret; |
334 | } | 417 | } |
335 | 418 | ||
419 | /* for explicit first chunk allocator selection */ | ||
420 | static char pcpu_chosen_alloc[16] __initdata; | ||
421 | |||
422 | static int __init percpu_alloc_setup(char *str) | ||
423 | { | ||
424 | strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); | ||
425 | return 0; | ||
426 | } | ||
427 | early_param("percpu_alloc", percpu_alloc_setup); | ||
428 | |||
336 | static inline void setup_percpu_segment(int cpu) | 429 | static inline void setup_percpu_segment(int cpu) |
337 | { | 430 | { |
338 | #ifdef CONFIG_X86_32 | 431 | #ifdef CONFIG_X86_32 |
@@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu) | |||
346 | #endif | 439 | #endif |
347 | } | 440 | } |
348 | 441 | ||
349 | /* | ||
350 | * Great future plan: | ||
351 | * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | ||
352 | * Always point %gs to its beginning | ||
353 | */ | ||
354 | void __init setup_per_cpu_areas(void) | 442 | void __init setup_per_cpu_areas(void) |
355 | { | 443 | { |
356 | size_t static_size = __per_cpu_end - __per_cpu_start; | 444 | size_t static_size = __per_cpu_end - __per_cpu_start; |
@@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void) | |||
367 | * of large page mappings. Please read comments on top of | 455 | * of large page mappings. Please read comments on top of |
368 | * each allocator for details. | 456 | * each allocator for details. |
369 | */ | 457 | */ |
370 | ret = setup_pcpu_remap(static_size); | 458 | ret = -EINVAL; |
371 | if (ret < 0) | 459 | if (strlen(pcpu_chosen_alloc)) { |
372 | ret = setup_pcpu_embed(static_size); | 460 | if (strcmp(pcpu_chosen_alloc, "4k")) { |
461 | if (!strcmp(pcpu_chosen_alloc, "lpage")) | ||
462 | ret = setup_pcpu_lpage(static_size, true); | ||
463 | else if (!strcmp(pcpu_chosen_alloc, "embed")) | ||
464 | ret = setup_pcpu_embed(static_size, true); | ||
465 | else | ||
466 | pr_warning("PERCPU: unknown allocator %s " | ||
467 | "specified\n", pcpu_chosen_alloc); | ||
468 | if (ret < 0) | ||
469 | pr_warning("PERCPU: %s allocator failed (%zd), " | ||
470 | "falling back to 4k\n", | ||
471 | pcpu_chosen_alloc, ret); | ||
472 | } | ||
473 | } else { | ||
474 | ret = setup_pcpu_lpage(static_size, false); | ||
475 | if (ret < 0) | ||
476 | ret = setup_pcpu_embed(static_size, false); | ||
477 | } | ||
373 | if (ret < 0) | 478 | if (ret < 0) |
374 | ret = setup_pcpu_4k(static_size); | 479 | ret = setup_pcpu_4k(static_size); |
375 | if (ret < 0) | 480 | if (ret < 0) |
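pcpu_lpage_remapped() relies on the one-time O(n^2) exchange sort of pcpul_map done right after the first chunk is set up: with the entries ordered by chunk address, mapping a kernel address back to its remapped percpu alias becomes a binary search on the PMD-aligned key. A standalone model with made-up sizes and addresses:

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE  (2UL << 20)            /* 2MB large pages */
#define PMD_MASK  (~(PMD_SIZE - 1))
#define NR_CPUS   4

struct ent { unsigned int cpu; uintptr_t ptr; };

static struct ent map[NR_CPUS] = {       /* unsorted, as allocated */
	{ 0, 0x40000000UL }, { 1, 0x40a00000UL },
	{ 2, 0x40400000UL }, { 3, 0x40800000UL },
};
static uintptr_t vm_addr = 0xe0000000UL; /* base of the remapped area */

static uintptr_t lpage_remapped(uintptr_t kaddr)
{
	uintptr_t pmd_addr = kaddr & PMD_MASK;
	uintptr_t offset = kaddr & ~PMD_MASK;
	int left = 0, right = NR_CPUS - 1, pos;

	while (left <= right) {
		pos = (left + right) / 2;
		if (map[pos].ptr < pmd_addr)
			left = pos + 1;
		else if (map[pos].ptr > pmd_addr)
			right = pos - 1;
		else
			return vm_addr + map[pos].cpu * PMD_SIZE + offset;
	}
	return 0;                        /* not a recycled pcpu page */
}

int main(void)
{
	int i, j;

	/* same simple exchange sort as the patch; runs once at boot */
	for (i = 0; i < NR_CPUS - 1; i++)
		for (j = i + 1; j < NR_CPUS; j++)
			if (map[i].ptr > map[j].ptr) {
				struct ent t = map[i];
				map[i] = map[j];
				map[j] = t;
			}

	printf("%#lx\n", (unsigned long)lpage_remapped(0x40801234UL));
	return 0;
}

pageattr uses this lookup to find the VM alias of a page sitting in the recycled tail of a partially used PMD, so it can split the large mapping instead of leaving the same physical page mapped with conflicting attributes.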
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6bc211accf08..45e00eb09c3a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -18,9 +18,9 @@ | |||
18 | #include <asm/ia32.h> | 18 | #include <asm/ia32.h> |
19 | #include <asm/syscalls.h> | 19 | #include <asm/syscalls.h> |
20 | 20 | ||
21 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, | 21 | SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, |
22 | unsigned long prot, unsigned long flags, | 22 | unsigned long, prot, unsigned long, flags, |
23 | unsigned long fd, unsigned long off) | 23 | unsigned long, fd, unsigned long, off) |
24 | { | 24 | { |
25 | long error; | 25 | long error; |
26 | struct file *file; | 26 | struct file *file; |
@@ -226,7 +226,7 @@ bottomup: | |||
226 | } | 226 | } |
227 | 227 | ||
228 | 228 | ||
229 | asmlinkage long sys_uname(struct new_utsname __user *name) | 229 | SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) |
230 | { | 230 | { |
231 | int err; | 231 | int err; |
232 | down_read(&uts_sem); | 232 | down_read(&uts_sem); |
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 124d40c575df..77b9689f8edb 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode) | |||
711 | unsigned long pa; | 711 | unsigned long pa; |
712 | unsigned long m; | 712 | unsigned long m; |
713 | unsigned long n; | 713 | unsigned long n; |
714 | unsigned long mmr_image; | ||
715 | struct bau_desc *adp; | 714 | struct bau_desc *adp; |
716 | struct bau_desc *ad2; | 715 | struct bau_desc *ad2; |
717 | 716 | ||
@@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode) | |||
727 | n = pa >> uv_nshift; | 726 | n = pa >> uv_nshift; |
728 | m = pa & uv_mmask; | 727 | m = pa & uv_mmask; |
729 | 728 | ||
730 | mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); | 729 | uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, |
731 | if (mmr_image) { | 730 | (n << UV_DESC_BASE_PNODE_SHIFT | m)); |
732 | uv_write_global_mmr64(pnode, (unsigned long) | ||
733 | UVH_LB_BAU_SB_DESCRIPTOR_BASE, | ||
734 | (n << UV_DESC_BASE_PNODE_SHIFT | m)); | ||
735 | } | ||
736 | 731 | ||
737 | /* | 732 | /* |
738 | * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each | 733 | * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each |
@@ -749,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode) | |||
749 | * note that base_dest_nodeid is actually a nasid. | 744 | * note that base_dest_nodeid is actually a nasid. |
750 | */ | 745 | */ |
751 | ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; | 746 | ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; |
747 | ad2->header.dest_subnodeid = 0x10; /* the LB */ | ||
752 | ad2->header.command = UV_NET_ENDPOINT_INTD; | 748 | ad2->header.command = UV_NET_ENDPOINT_INTD; |
753 | ad2->header.int_both = 1; | 749 | ad2->header.int_both = 1; |
754 | /* | 750 | /* |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 286d64eba31b..ae04589a579b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include <asm/traps.h> | 54 | #include <asm/traps.h> |
55 | #include <asm/desc.h> | 55 | #include <asm/desc.h> |
56 | #include <asm/i387.h> | 56 | #include <asm/i387.h> |
57 | #include <asm/mce.h> | ||
57 | 58 | ||
58 | #include <asm/mach_traps.h> | 59 | #include <asm/mach_traps.h> |
59 | 60 | ||
@@ -65,8 +66,6 @@ | |||
65 | #include <asm/setup.h> | 66 | #include <asm/setup.h> |
66 | #include <asm/traps.h> | 67 | #include <asm/traps.h> |
67 | 68 | ||
68 | #include "cpu/mcheck/mce.h" | ||
69 | |||
70 | asmlinkage int system_call(void); | 69 | asmlinkage int system_call(void); |
71 | 70 | ||
72 | /* Do we ignore FPU interrupts ? */ | 71 | /* Do we ignore FPU interrupts ? */ |
@@ -347,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs) | |||
347 | printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); | 346 | printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); |
348 | show_registers(regs); | 347 | show_registers(regs); |
349 | 348 | ||
349 | if (panic_on_io_nmi) | ||
350 | panic("NMI IOCK error: Not continuing"); | ||
351 | |||
350 | /* Re-enable the IOCK line, wait for a few seconds */ | 352 | /* Re-enable the IOCK line, wait for a few seconds */ |
351 | reason = (reason & 0xf) | 8; | 353 | reason = (reason & 0xf) | 8; |
352 | outb(reason, 0x61); | 354 | outb(reason, 0x61); |
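The new panic_on_io_nmi flag turns an IOCK NMI from a logged-and-continue event into an immediate panic, which is useful on kdump setups. Presumably it is exposed as a sysctl by a companion kernel/sysctl.c change; a hedged sketch of the usual ctl_table entry for such an int flag (the procname and wiring are assumptions, following the existing panic_on_unrecovered_nmi pattern):

	/* Fragment of a ctl_table[] in kernel/sysctl.c (assumed wiring): */
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "panic_on_io_nmi",
		.data		= &panic_on_io_nmi,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},

With that in place the knob would appear as /proc/sys/kernel/panic_on_io_nmi.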
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ae3180c506a6..71f4368b357e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) | |||
275 | * use the TSC value at the transitions to calculate a pretty | 275 | * use the TSC value at the transitions to calculate a pretty |
276 | * good value for the TSC frequency. | 276 | * good value for the TSC frequency. |
277 | */ | 277 | */ |
278 | static inline int pit_verify_msb(unsigned char val) | ||
279 | { | ||
280 | /* Ignore LSB */ | ||
281 | inb(0x42); | ||
282 | return inb(0x42) == val; | ||
283 | } | ||
284 | |||
278 | static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) | 285 | static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) |
279 | { | 286 | { |
280 | int count; | 287 | int count; |
281 | u64 tsc = 0; | 288 | u64 tsc = 0; |
282 | 289 | ||
283 | for (count = 0; count < 50000; count++) { | 290 | for (count = 0; count < 50000; count++) { |
284 | /* Ignore LSB */ | 291 | if (!pit_verify_msb(val)) |
285 | inb(0x42); | ||
286 | if (inb(0x42) != val) | ||
287 | break; | 292 | break; |
288 | tsc = get_cycles(); | 293 | tsc = get_cycles(); |
289 | } | 294 | } |
@@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void) | |||
336 | * to do that is to just read back the 16-bit counter | 341 | * to do that is to just read back the 16-bit counter |
337 | * once from the PIT. | 342 | * once from the PIT. |
338 | */ | 343 | */ |
339 | inb(0x42); | 344 | pit_verify_msb(0); |
340 | inb(0x42); | ||
341 | 345 | ||
342 | if (pit_expect_msb(0xff, &tsc, &d1)) { | 346 | if (pit_expect_msb(0xff, &tsc, &d1)) { |
343 | for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { | 347 | for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { |
@@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void) | |||
348 | * Iterate until the error is less than 500 ppm | 352 | * Iterate until the error is less than 500 ppm |
349 | */ | 353 | */ |
350 | delta -= tsc; | 354 | delta -= tsc; |
351 | if (d1+d2 < delta >> 11) | 355 | if (d1+d2 >= delta >> 11) |
352 | goto success; | 356 | continue; |
357 | |||
358 | /* | ||
359 | * Check the PIT one more time to verify that | ||
360 | * all TSC reads were stable wrt the PIT. | ||
361 | * | ||
362 | * This also guarantees serialization of the | ||
363 | * last cycle read ('d2') in pit_expect_msb. | ||
364 | */ | ||
365 | if (!pit_verify_msb(0xfe - i)) | ||
366 | break; | ||
367 | goto success; | ||
353 | } | 368 | } |
354 | } | 369 | } |
355 | printk("Fast TSC calibration failed\n"); | 370 | printk("Fast TSC calibration failed\n"); |
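The inverted test reads more naturally once the arithmetic is spelled out: d1+d2 is the read uncertainty in TSC cycles, and delta >> 11 is delta/2048, an error budget of roughly 488 ppm of the elapsed cycles, so "d1+d2 >= delta >> 11" means the measurement is still too noisy and the loop must keep iterating. A worked check of the numbers (plain userspace C, illustration only):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long delta = 10000000ULL;	/* example elapsed TSC cycles */
		unsigned long long budget = delta >> 11;	/* delta/2048 */

		/* Prints: budget = 4882 cycles (488.2 ppm) */
		printf("budget = %llu cycles (%.1f ppm)\n",
		       budget, budget * 1e6 / delta);
		return 0;
	}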
@@ -590,22 +605,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); | |||
590 | */ | 605 | */ |
591 | 606 | ||
592 | DEFINE_PER_CPU(unsigned long, cyc2ns); | 607 | DEFINE_PER_CPU(unsigned long, cyc2ns); |
608 | DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); | ||
593 | 609 | ||
594 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | 610 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) |
595 | { | 611 | { |
596 | unsigned long long tsc_now, ns_now; | 612 | unsigned long long tsc_now, ns_now, *offset; |
597 | unsigned long flags, *scale; | 613 | unsigned long flags, *scale; |
598 | 614 | ||
599 | local_irq_save(flags); | 615 | local_irq_save(flags); |
600 | sched_clock_idle_sleep_event(); | 616 | sched_clock_idle_sleep_event(); |
601 | 617 | ||
602 | scale = &per_cpu(cyc2ns, cpu); | 618 | scale = &per_cpu(cyc2ns, cpu); |
619 | offset = &per_cpu(cyc2ns_offset, cpu); | ||
603 | 620 | ||
604 | rdtscll(tsc_now); | 621 | rdtscll(tsc_now); |
605 | ns_now = __cycles_2_ns(tsc_now); | 622 | ns_now = __cycles_2_ns(tsc_now); |
606 | 623 | ||
607 | if (cpu_khz) | 624 | if (cpu_khz) { |
608 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; | 625 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; |
626 | *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); | ||
627 | } | ||
609 | 628 | ||
610 | sched_clock_idle_wakeup_event(0); | 629 | sched_clock_idle_wakeup_event(0); |
611 | local_irq_restore(flags); | 630 | local_irq_restore(flags); |
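The per-CPU cyc2ns_offset exists so that sched_clock() stays continuous when cpufreq rescales the TSC: at the moment the scale changes, the offset is set to the difference between the current ns value and what the new scale alone would report. The matching reader presumably becomes something like the following (a sketch; the real __cycles_2_ns() lives in arch/x86/include/asm/timer.h):

	/* Sketch of the offset-aware cycles-to-ns conversion, per CPU: */
	static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
	{
		int cpu = smp_processor_id();
		unsigned long long ns = per_cpu(cyc2ns_offset, cpu);

		ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
		return ns;
	}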
@@ -632,17 +651,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | |||
632 | void *data) | 651 | void *data) |
633 | { | 652 | { |
634 | struct cpufreq_freqs *freq = data; | 653 | struct cpufreq_freqs *freq = data; |
635 | unsigned long *lpj, dummy; | 654 | unsigned long *lpj; |
636 | 655 | ||
637 | if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) | 656 | if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) |
638 | return 0; | 657 | return 0; |
639 | 658 | ||
640 | lpj = &dummy; | 659 | lpj = &boot_cpu_data.loops_per_jiffy; |
641 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
642 | #ifdef CONFIG_SMP | 660 | #ifdef CONFIG_SMP |
661 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | ||
643 | lpj = &cpu_data(freq->cpu).loops_per_jiffy; | 662 | lpj = &cpu_data(freq->cpu).loops_per_jiffy; |
644 | #else | ||
645 | lpj = &boot_cpu_data.loops_per_jiffy; | ||
646 | #endif | 663 | #endif |
647 | 664 | ||
648 | if (!ref_freq) { | 665 | if (!ref_freq) { |
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b263423fbe2a..95a7289e4b0c 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, | |||
441 | ap.ds = __USER_DS; | 441 | ap.ds = __USER_DS; |
442 | ap.es = __USER_DS; | 442 | ap.es = __USER_DS; |
443 | ap.fs = __KERNEL_PERCPU; | 443 | ap.fs = __KERNEL_PERCPU; |
444 | ap.gs = 0; | 444 | ap.gs = __KERNEL_STACK_CANARY; |
445 | 445 | ||
446 | ap.eflags = 0; | 446 | ap.eflags = 0; |
447 | 447 | ||
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e87882041..9fc178255c04 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -46,11 +46,10 @@ PHDRS { | |||
46 | data PT_LOAD FLAGS(7); /* RWE */ | 46 | data PT_LOAD FLAGS(7); /* RWE */ |
47 | #ifdef CONFIG_X86_64 | 47 | #ifdef CONFIG_X86_64 |
48 | user PT_LOAD FLAGS(7); /* RWE */ | 48 | user PT_LOAD FLAGS(7); /* RWE */ |
49 | data.init PT_LOAD FLAGS(7); /* RWE */ | ||
50 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
51 | percpu PT_LOAD FLAGS(7); /* RWE */ | 50 | percpu PT_LOAD FLAGS(7); /* RWE */ |
52 | #endif | 51 | #endif |
53 | data.init2 PT_LOAD FLAGS(7); /* RWE */ | 52 | init PT_LOAD FLAGS(7); /* RWE */ |
54 | #endif | 53 | #endif |
55 | note PT_NOTE FLAGS(0); /* ___ */ | 54 | note PT_NOTE FLAGS(0); /* ___ */ |
56 | } | 55 | } |
@@ -103,72 +102,43 @@ SECTIONS | |||
103 | __stop___ex_table = .; | 102 | __stop___ex_table = .; |
104 | } :text = 0x9090 | 103 | } :text = 0x9090 |
105 | 104 | ||
106 | RODATA | 105 | RO_DATA(PAGE_SIZE) |
107 | 106 | ||
108 | /* Data */ | 107 | /* Data */ |
109 | . = ALIGN(PAGE_SIZE); | ||
110 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | 108 | .data : AT(ADDR(.data) - LOAD_OFFSET) { |
111 | /* Start of data section */ | 109 | /* Start of data section */ |
112 | _sdata = .; | 110 | _sdata = .; |
113 | DATA_DATA | ||
114 | CONSTRUCTORS | ||
115 | 111 | ||
116 | #ifdef CONFIG_X86_64 | 112 | /* init_task */ |
117 | /* End of data section */ | 113 | INIT_TASK_DATA(THREAD_SIZE) |
118 | _edata = .; | ||
119 | #endif | ||
120 | } :data | ||
121 | 114 | ||
122 | #ifdef CONFIG_X86_32 | 115 | #ifdef CONFIG_X86_32 |
123 | /* 32 bit has nosave before _edata */ | 116 | /* 32 bit has nosave before _edata */ |
124 | . = ALIGN(PAGE_SIZE); | 117 | NOSAVE_DATA |
125 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
126 | __nosave_begin = .; | ||
127 | *(.data.nosave) | ||
128 | . = ALIGN(PAGE_SIZE); | ||
129 | __nosave_end = .; | ||
130 | } | ||
131 | #endif | 118 | #endif |
132 | 119 | ||
133 | . = ALIGN(PAGE_SIZE); | 120 | PAGE_ALIGNED_DATA(PAGE_SIZE) |
134 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
135 | *(.data.page_aligned) | ||
136 | *(.data.idt) | 121 | *(.data.idt) |
137 | } | ||
138 | 122 | ||
139 | #ifdef CONFIG_X86_32 | 123 | CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) |
140 | . = ALIGN(32); | ||
141 | #else | ||
142 | . = ALIGN(PAGE_SIZE); | ||
143 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
144 | #endif | ||
145 | .data.cacheline_aligned : | ||
146 | AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
147 | *(.data.cacheline_aligned) | ||
148 | } | ||
149 | 124 | ||
150 | /* rarely changed data like cpu maps */ | 125 | DATA_DATA |
151 | #ifdef CONFIG_X86_32 | 126 | CONSTRUCTORS |
152 | . = ALIGN(32); | 127 | |
153 | #else | 128 | /* rarely changed data like cpu maps */ |
154 | . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); | 129 | READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) |
155 | #endif | ||
156 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
157 | *(.data.read_mostly) | ||
158 | 130 | ||
159 | #ifdef CONFIG_X86_32 | ||
160 | /* End of data section */ | 131 | /* End of data section */ |
161 | _edata = .; | 132 | _edata = .; |
162 | #endif | 133 | } :data |
163 | } | ||
164 | 134 | ||
165 | #ifdef CONFIG_X86_64 | 135 | #ifdef CONFIG_X86_64 |
166 | 136 | ||
167 | #define VSYSCALL_ADDR (-10*1024*1024) | 137 | #define VSYSCALL_ADDR (-10*1024*1024) |
168 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ | 138 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ |
169 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | 139 | PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) |
170 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ | 140 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ |
171 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | 141 | PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) |
172 | 142 | ||
173 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | 143 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) |
174 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | 144 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) |
@@ -234,35 +204,29 @@ SECTIONS | |||
234 | 204 | ||
235 | #endif /* CONFIG_X86_64 */ | 205 | #endif /* CONFIG_X86_64 */ |
236 | 206 | ||
237 | /* init_task */ | 207 | /* Init code and data - will be freed after init */ |
238 | . = ALIGN(THREAD_SIZE); | 208 | . = ALIGN(PAGE_SIZE); |
239 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | 209 | .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { |
240 | *(.data.init_task) | 210 | __init_begin = .; /* paired with __init_end */ |
241 | } | 211 | } |
242 | #ifdef CONFIG_X86_64 | ||
243 | :data.init | ||
244 | #endif | ||
245 | 212 | ||
213 | #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) | ||
246 | /* | 214 | /* |
247 | * smp_locks might be freed after init | 215 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the |
248 | * start/end must be page aligned | 216 | * output PHDR, so the next output section - .init.text - should |
217 | * start another segment - init. | ||
249 | */ | 218 | */ |
250 | . = ALIGN(PAGE_SIZE); | 219 | PERCPU_VADDR(0, :percpu) |
251 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | 220 | #endif |
252 | __smp_locks = .; | ||
253 | *(.smp_locks) | ||
254 | __smp_locks_end = .; | ||
255 | . = ALIGN(PAGE_SIZE); | ||
256 | } | ||
257 | 221 | ||
258 | /* Init code and data - will be freed after init */ | ||
259 | . = ALIGN(PAGE_SIZE); | ||
260 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | 222 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { |
261 | __init_begin = .; /* paired with __init_end */ | ||
262 | _sinittext = .; | 223 | _sinittext = .; |
263 | INIT_TEXT | 224 | INIT_TEXT |
264 | _einittext = .; | 225 | _einittext = .; |
265 | } | 226 | } |
227 | #ifdef CONFIG_X86_64 | ||
228 | :init | ||
229 | #endif | ||
266 | 230 | ||
267 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { | 231 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { |
268 | INIT_DATA | 232 | INIT_DATA |
@@ -333,17 +297,7 @@ SECTIONS | |||
333 | } | 297 | } |
334 | #endif | 298 | #endif |
335 | 299 | ||
336 | #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) | 300 | #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) |
337 | /* | ||
338 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the | ||
339 | * output PHDR, so the next output section - __data_nosave - should | ||
340 | * start another section data.init2. Also, pda should be at the head of | ||
341 | * percpu area. Preallocate it and define the percpu offset symbol | ||
342 | * so that it can be accessed as a percpu variable. | ||
343 | */ | ||
344 | . = ALIGN(PAGE_SIZE); | ||
345 | PERCPU_VADDR(0, :percpu) | ||
346 | #else | ||
347 | PERCPU(PAGE_SIZE) | 301 | PERCPU(PAGE_SIZE) |
348 | #endif | 302 | #endif |
349 | 303 | ||
@@ -354,15 +308,22 @@ SECTIONS | |||
354 | __init_end = .; | 308 | __init_end = .; |
355 | } | 309 | } |
356 | 310 | ||
311 | /* | ||
312 | * smp_locks might be freed after init | ||
313 | * start/end must be page aligned | ||
314 | */ | ||
315 | . = ALIGN(PAGE_SIZE); | ||
316 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
317 | __smp_locks = .; | ||
318 | *(.smp_locks) | ||
319 | __smp_locks_end = .; | ||
320 | . = ALIGN(PAGE_SIZE); | ||
321 | } | ||
322 | |||
357 | #ifdef CONFIG_X86_64 | 323 | #ifdef CONFIG_X86_64 |
358 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | 324 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { |
359 | . = ALIGN(PAGE_SIZE); | 325 | NOSAVE_DATA |
360 | __nosave_begin = .; | 326 | } |
361 | *(.data.nosave) | ||
362 | . = ALIGN(PAGE_SIZE); | ||
363 | __nosave_end = .; | ||
364 | } :data.init2 | ||
365 | /* use another section data.init2, see PERCPU_VADDR() above */ | ||
366 | #endif | 327 | #endif |
367 | 328 | ||
368 | /* BSS */ | 329 | /* BSS */ |
@@ -400,8 +361,8 @@ SECTIONS | |||
400 | 361 | ||
401 | 362 | ||
402 | #ifdef CONFIG_X86_32 | 363 | #ifdef CONFIG_X86_32 |
403 | ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), | 364 | . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), |
404 | "kernel image bigger than KERNEL_IMAGE_SIZE") | 365 | "kernel image bigger than KERNEL_IMAGE_SIZE"); |
405 | #else | 366 | #else |
406 | /* | 367 | /* |
407 | * Per-cpu symbols which need to be offset from __per_cpu_load | 368 | * Per-cpu symbols which need to be offset from __per_cpu_load |
@@ -414,12 +375,12 @@ INIT_PER_CPU(irq_stack_union); | |||
414 | /* | 375 | /* |
415 | * Build-time check on the image size: | 376 | * Build-time check on the image size: |
416 | */ | 377 | */ |
417 | ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), | 378 | . = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), |
418 | "kernel image bigger than KERNEL_IMAGE_SIZE") | 379 | "kernel image bigger than KERNEL_IMAGE_SIZE"); |
419 | 380 | ||
420 | #ifdef CONFIG_SMP | 381 | #ifdef CONFIG_SMP |
421 | ASSERT((per_cpu__irq_stack_union == 0), | 382 | . = ASSERT((per_cpu__irq_stack_union == 0), |
422 | "irq_stack_union is not at start of per-cpu area"); | 383 | "irq_stack_union is not at start of per-cpu area"); |
423 | #endif | 384 | #endif |
424 | 385 | ||
425 | #endif /* CONFIG_X86_32 */ | 386 | #endif /* CONFIG_X86_32 */ |
@@ -427,7 +388,7 @@ ASSERT((per_cpu__irq_stack_union == 0), | |||
427 | #ifdef CONFIG_KEXEC | 388 | #ifdef CONFIG_KEXEC |
428 | #include <asm/kexec.h> | 389 | #include <asm/kexec.h> |
429 | 390 | ||
430 | ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, | 391 | . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, |
431 | "kexec control code size is too big") | 392 | "kexec control code size is too big"); |
432 | #endif | 393 | #endif |
433 | 394 | ||
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4d6f0d293ee2..21f68e00524f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) | |||
104 | ktime_t remaining; | 104 | ktime_t remaining; |
105 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; | 105 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; |
106 | 106 | ||
107 | if (!ps->pit_timer.period) | ||
108 | return 0; | ||
109 | |||
107 | /* | 110 | /* |
108 | * The Counter does not stop when it reaches zero. In | 111 | * The Counter does not stop when it reaches zero. In |
109 | * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to | 112 | * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to |
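The early return is a division-by-zero guard: later in __kpit_elapsed() the elapsed time is reduced modulo the programmed period, and a PIT channel that was never set up has a period of 0. Sketch of the tail of the function as I understand the code of this era (assumed context, condensed):

	remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
	elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
	elapsed = mod_64(elapsed, ps->pit_timer.period);	/* a 0 period would divide by zero here */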
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c3d6e81a7dc..0ef5bb2b4043 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) | |||
489 | * | 489 | * |
490 | * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc | 490 | * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc |
491 | * containing more mappings. | 491 | * containing more mappings. |
492 | * | ||
493 | * Returns the number of rmap entries before the spte was added or zero if | ||
494 | * the spte was not added. | ||
495 | * | ||
492 | */ | 496 | */ |
493 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | 497 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) |
494 | { | 498 | { |
495 | struct kvm_mmu_page *sp; | 499 | struct kvm_mmu_page *sp; |
496 | struct kvm_rmap_desc *desc; | 500 | struct kvm_rmap_desc *desc; |
497 | unsigned long *rmapp; | 501 | unsigned long *rmapp; |
498 | int i; | 502 | int i, count = 0; |
499 | 503 | ||
500 | if (!is_rmap_pte(*spte)) | 504 | if (!is_rmap_pte(*spte)) |
501 | return; | 505 | return count; |
502 | gfn = unalias_gfn(vcpu->kvm, gfn); | 506 | gfn = unalias_gfn(vcpu->kvm, gfn); |
503 | sp = page_header(__pa(spte)); | 507 | sp = page_header(__pa(spte)); |
504 | sp->gfns[spte - sp->spt] = gfn; | 508 | sp->gfns[spte - sp->spt] = gfn; |
@@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | |||
515 | } else { | 519 | } else { |
516 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 520 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); |
517 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 521 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
518 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) | 522 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { |
519 | desc = desc->more; | 523 | desc = desc->more; |
524 | count += RMAP_EXT; | ||
525 | } | ||
520 | if (desc->shadow_ptes[RMAP_EXT-1]) { | 526 | if (desc->shadow_ptes[RMAP_EXT-1]) { |
521 | desc->more = mmu_alloc_rmap_desc(vcpu); | 527 | desc->more = mmu_alloc_rmap_desc(vcpu); |
522 | desc = desc->more; | 528 | desc = desc->more; |
@@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | |||
525 | ; | 531 | ; |
526 | desc->shadow_ptes[i] = spte; | 532 | desc->shadow_ptes[i] = spte; |
527 | } | 533 | } |
534 | return count; | ||
528 | } | 535 | } |
529 | 536 | ||
530 | static void rmap_desc_remove_entry(unsigned long *rmapp, | 537 | static void rmap_desc_remove_entry(unsigned long *rmapp, |
@@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
754 | return young; | 761 | return young; |
755 | } | 762 | } |
756 | 763 | ||
764 | #define RMAP_RECYCLE_THRESHOLD 1000 | ||
765 | |||
766 | static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) | ||
767 | { | ||
768 | unsigned long *rmapp; | ||
769 | |||
770 | gfn = unalias_gfn(vcpu->kvm, gfn); | ||
771 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); | ||
772 | |||
773 | kvm_unmap_rmapp(vcpu->kvm, rmapp); | ||
774 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
775 | } | ||
776 | |||
757 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) | 777 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) |
758 | { | 778 | { |
759 | return kvm_handle_hva(kvm, hva, kvm_age_rmapp); | 779 | return kvm_handle_hva(kvm, hva, kvm_age_rmapp); |
@@ -1407,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1407 | */ | 1427 | */ |
1408 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | 1428 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) |
1409 | { | 1429 | { |
1430 | int used_pages; | ||
1431 | |||
1432 | used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; | ||
1433 | used_pages = max(0, used_pages); | ||
1434 | |||
1410 | /* | 1435 | /* |
1411 | * If we set the number of mmu pages to be smaller than the | 1436 | * If we set the number of mmu pages to be smaller than the |
1412 | * number of active pages, we must free some mmu pages before we | 1437 | * number of active pages, we must free some mmu pages before we |
1413 | * change the value | 1438 | * change the value |
1414 | */ | 1439 | */ |
1415 | 1440 | ||
1416 | if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > | 1441 | if (used_pages > kvm_nr_mmu_pages) { |
1417 | kvm_nr_mmu_pages) { | 1442 | while (used_pages > kvm_nr_mmu_pages) { |
1418 | int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages | ||
1419 | - kvm->arch.n_free_mmu_pages; | ||
1420 | |||
1421 | while (n_used_mmu_pages > kvm_nr_mmu_pages) { | ||
1422 | struct kvm_mmu_page *page; | 1443 | struct kvm_mmu_page *page; |
1423 | 1444 | ||
1424 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1445 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1425 | struct kvm_mmu_page, link); | 1446 | struct kvm_mmu_page, link); |
1426 | kvm_mmu_zap_page(kvm, page); | 1447 | kvm_mmu_zap_page(kvm, page); |
1427 | n_used_mmu_pages--; | 1448 | used_pages--; |
1428 | } | 1449 | } |
1429 | kvm->arch.n_free_mmu_pages = 0; | 1450 | kvm->arch.n_free_mmu_pages = 0; |
1430 | } | 1451 | } |
@@ -1740,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1740 | { | 1761 | { |
1741 | int was_rmapped = 0; | 1762 | int was_rmapped = 0; |
1742 | int was_writeble = is_writeble_pte(*shadow_pte); | 1763 | int was_writeble = is_writeble_pte(*shadow_pte); |
1764 | int rmap_count; | ||
1743 | 1765 | ||
1744 | pgprintk("%s: spte %llx access %x write_fault %d" | 1766 | pgprintk("%s: spte %llx access %x write_fault %d" |
1745 | " user_fault %d gfn %lx\n", | 1767 | " user_fault %d gfn %lx\n", |
@@ -1781,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1781 | 1803 | ||
1782 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1804 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); |
1783 | if (!was_rmapped) { | 1805 | if (!was_rmapped) { |
1784 | rmap_add(vcpu, shadow_pte, gfn, largepage); | 1806 | rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); |
1785 | if (!is_rmap_pte(*shadow_pte)) | 1807 | if (!is_rmap_pte(*shadow_pte)) |
1786 | kvm_release_pfn_clean(pfn); | 1808 | kvm_release_pfn_clean(pfn); |
1809 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | ||
1810 | rmap_recycle(vcpu, gfn, largepage); | ||
1787 | } else { | 1811 | } else { |
1788 | if (was_writeble) | 1812 | if (was_writeble) |
1789 | kvm_release_pfn_dirty(pfn); | 1813 | kvm_release_pfn_dirty(pfn); |
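Tying the mmu.c hunks together: rmap_add() now returns an approximation of how long the gfn's reverse-map chain already was (counted in whole RMAP_EXT-sized descriptors, so the value is a multiple of RMAP_EXT), and mmu_set_spte() compares that against RMAP_RECYCLE_THRESHOLD. Condensed flow of the new path, restating the diff:

	rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
	if (rmap_count > RMAP_RECYCLE_THRESHOLD)	/* chain past ~1000 sptes */
		rmap_recycle(vcpu, gfn, largepage);	/* kvm_unmap_rmapp + remote TLB flush */

A guest page that accumulates pathologically many shadow mappings thus gets its chain zapped instead of growing without bound.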
@@ -2157,7 +2181,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
2157 | else | 2181 | else |
2158 | /* 32 bits PSE 4MB page */ | 2182 | /* 32 bits PSE 4MB page */ |
2159 | context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); | 2183 | context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); |
2160 | context->rsvd_bits_mask[1][0] = ~0ull; | 2184 | context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; |
2161 | break; | 2185 | break; |
2162 | case PT32E_ROOT_LEVEL: | 2186 | case PT32E_ROOT_LEVEL: |
2163 | context->rsvd_bits_mask[0][2] = | 2187 | context->rsvd_bits_mask[0][2] = |
@@ -2170,7 +2194,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
2170 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | 2194 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | |
2171 | rsvd_bits(maxphyaddr, 62) | | 2195 | rsvd_bits(maxphyaddr, 62) | |
2172 | rsvd_bits(13, 20); /* large page */ | 2196 | rsvd_bits(13, 20); /* large page */ |
2173 | context->rsvd_bits_mask[1][0] = ~0ull; | 2197 | context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; |
2174 | break; | 2198 | break; |
2175 | case PT64_ROOT_LEVEL: | 2199 | case PT64_ROOT_LEVEL: |
2176 | context->rsvd_bits_mask[0][3] = exb_bit_rsvd | | 2200 | context->rsvd_bits_mask[0][3] = exb_bit_rsvd | |
@@ -2186,7 +2210,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
2186 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | 2210 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | |
2187 | rsvd_bits(maxphyaddr, 51) | | 2211 | rsvd_bits(maxphyaddr, 51) | |
2188 | rsvd_bits(13, 20); /* large page */ | 2212 | rsvd_bits(13, 20); /* large page */ |
2189 | context->rsvd_bits_mask[1][0] = ~0ull; | 2213 | context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; |
2190 | break; | 2214 | break; |
2191 | } | 2215 | } |
2192 | } | 2216 | } |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 258e4591e1ca..67785f635399 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -281,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
281 | { | 281 | { |
282 | unsigned access = gw->pt_access; | 282 | unsigned access = gw->pt_access; |
283 | struct kvm_mmu_page *shadow_page; | 283 | struct kvm_mmu_page *shadow_page; |
284 | u64 spte, *sptep; | 284 | u64 spte, *sptep = NULL; |
285 | int direct; | 285 | int direct; |
286 | gfn_t table_gfn; | 286 | gfn_t table_gfn; |
287 | int r; | 287 | int r; |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 71510e07e69e..b1f658ad2f06 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
711 | svm->vmcb->control.tsc_offset += delta; | 711 | svm->vmcb->control.tsc_offset += delta; |
712 | vcpu->cpu = cpu; | 712 | vcpu->cpu = cpu; |
713 | kvm_migrate_timers(vcpu); | 713 | kvm_migrate_timers(vcpu); |
714 | svm->asid_generation = 0; | ||
714 | } | 715 | } |
715 | 716 | ||
716 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 717 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
@@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) | |||
1031 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; | 1032 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; |
1032 | } | 1033 | } |
1033 | 1034 | ||
1034 | svm->vcpu.cpu = svm_data->cpu; | ||
1035 | svm->asid_generation = svm_data->asid_generation; | 1035 | svm->asid_generation = svm_data->asid_generation; |
1036 | svm->vmcb->control.asid = svm_data->next_asid++; | 1036 | svm->vmcb->control.asid = svm_data->next_asid++; |
1037 | } | 1037 | } |
@@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm) | |||
2300 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | 2300 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); |
2301 | 2301 | ||
2302 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | 2302 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; |
2303 | if (svm->vcpu.cpu != cpu || | 2303 | /* FIXME: handle wraparound of asid_generation */ |
2304 | svm->asid_generation != svm_data->asid_generation) | 2304 | if (svm->asid_generation != svm_data->asid_generation) |
2305 | new_asid(svm, svm_data); | 2305 | new_asid(svm, svm_data); |
2306 | } | 2306 | } |
2307 | 2307 | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e770bf349ec4..29f912927a58 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -3012,6 +3012,12 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3012 | return 1; | 3012 | return 1; |
3013 | } | 3013 | } |
3014 | 3014 | ||
3015 | static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
3016 | { | ||
3017 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
3018 | return 1; | ||
3019 | } | ||
3020 | |||
3015 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3021 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
3016 | { | 3022 | { |
3017 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 3023 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
@@ -3151,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
3151 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3157 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3152 | enum emulation_result err = EMULATE_DONE; | 3158 | enum emulation_result err = EMULATE_DONE; |
3153 | 3159 | ||
3154 | preempt_enable(); | ||
3155 | local_irq_enable(); | 3160 | local_irq_enable(); |
3161 | preempt_enable(); | ||
3156 | 3162 | ||
3157 | while (!guest_state_valid(vcpu)) { | 3163 | while (!guest_state_valid(vcpu)) { |
3158 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | 3164 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); |
@@ -3162,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
3162 | 3168 | ||
3163 | if (err != EMULATE_DONE) { | 3169 | if (err != EMULATE_DONE) { |
3164 | kvm_report_emulation_failure(vcpu, "emulation failure"); | 3170 | kvm_report_emulation_failure(vcpu, "emulation failure"); |
3165 | return; | 3171 | break; |
3166 | } | 3172 | } |
3167 | 3173 | ||
3168 | if (signal_pending(current)) | 3174 | if (signal_pending(current)) |
@@ -3171,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
3171 | schedule(); | 3177 | schedule(); |
3172 | } | 3178 | } |
3173 | 3179 | ||
3174 | local_irq_disable(); | ||
3175 | preempt_disable(); | 3180 | preempt_disable(); |
3181 | local_irq_disable(); | ||
3176 | 3182 | ||
3177 | vmx->invalid_state_emulation_result = err; | 3183 | vmx->invalid_state_emulation_result = err; |
3178 | } | 3184 | } |
@@ -3198,6 +3204,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
3198 | [EXIT_REASON_HLT] = handle_halt, | 3204 | [EXIT_REASON_HLT] = handle_halt, |
3199 | [EXIT_REASON_INVLPG] = handle_invlpg, | 3205 | [EXIT_REASON_INVLPG] = handle_invlpg, |
3200 | [EXIT_REASON_VMCALL] = handle_vmcall, | 3206 | [EXIT_REASON_VMCALL] = handle_vmcall, |
3207 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, | ||
3208 | [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, | ||
3209 | [EXIT_REASON_VMPTRLD] = handle_vmx_insn, | ||
3210 | [EXIT_REASON_VMPTRST] = handle_vmx_insn, | ||
3211 | [EXIT_REASON_VMREAD] = handle_vmx_insn, | ||
3212 | [EXIT_REASON_VMRESUME] = handle_vmx_insn, | ||
3213 | [EXIT_REASON_VMWRITE] = handle_vmx_insn, | ||
3214 | [EXIT_REASON_VMOFF] = handle_vmx_insn, | ||
3215 | [EXIT_REASON_VMON] = handle_vmx_insn, | ||
3201 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 3216 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
3202 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 3217 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
3203 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 3218 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 249540f98513..3d4529011828 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr) | |||
704 | return false; | 704 | return false; |
705 | } | 705 | } |
706 | 706 | ||
707 | static bool valid_pat_type(unsigned t) | ||
708 | { | ||
709 | return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ | ||
710 | } | ||
711 | |||
712 | static bool valid_mtrr_type(unsigned t) | ||
713 | { | ||
714 | return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ | ||
715 | } | ||
716 | |||
717 | static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
718 | { | ||
719 | int i; | ||
720 | |||
721 | if (!msr_mtrr_valid(msr)) | ||
722 | return false; | ||
723 | |||
724 | if (msr == MSR_IA32_CR_PAT) { | ||
725 | for (i = 0; i < 8; i++) | ||
726 | if (!valid_pat_type((data >> (i * 8)) & 0xff)) | ||
727 | return false; | ||
728 | return true; | ||
729 | } else if (msr == MSR_MTRRdefType) { | ||
730 | if (data & ~0xcff) | ||
731 | return false; | ||
732 | return valid_mtrr_type(data & 0xff); | ||
733 | } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { | ||
734 | for (i = 0; i < 8 ; i++) | ||
735 | if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) | ||
736 | return false; | ||
737 | return true; | ||
738 | } | ||
739 | |||
740 | /* variable MTRRs */ | ||
741 | return valid_mtrr_type(data & 0xff); | ||
742 | } | ||
743 | |||
707 | static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 744 | static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
708 | { | 745 | { |
709 | u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; | 746 | u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; |
710 | 747 | ||
711 | if (!msr_mtrr_valid(msr)) | 748 | if (!mtrr_valid(vcpu, msr, data)) |
712 | return 1; | 749 | return 1; |
713 | 750 | ||
714 | if (msr == MSR_MTRRdefType) { | 751 | if (msr == MSR_MTRRdefType) { |
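valid_pat_type() and valid_mtrr_type() above use a bitmask as a set-membership test: the legal PAT encodings {0, 1, 4, 5, 6, 7} become mask 0b11110011 = 0xf3, and the legal MTRR types {0, 1, 4, 5, 6} become 0x73. A standalone check of the trick (illustration only):

	#include <stdio.h>

	static int valid_pat_type(unsigned t)
	{
		return t < 8 && ((1u << t) & 0xf3);	/* {0,1,4,5,6,7} */
	}

	int main(void)
	{
		unsigned t;

		for (t = 0; t < 8; t++)	/* types 2 and 3 print "reserved" */
			printf("PAT type %u: %s\n", t,
			       valid_pat_type(t) ? "ok" : "reserved");
		return 0;
	}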
@@ -898,6 +935,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
898 | case MSR_VM_HSAVE_PA: | 935 | case MSR_VM_HSAVE_PA: |
899 | case MSR_P6_EVNTSEL0: | 936 | case MSR_P6_EVNTSEL0: |
900 | case MSR_P6_EVNTSEL1: | 937 | case MSR_P6_EVNTSEL1: |
938 | case MSR_K7_EVNTSEL0: | ||
901 | data = 0; | 939 | data = 0; |
902 | break; | 940 | break; |
903 | case MSR_MTRRcap: | 941 | case MSR_MTRRcap: |
@@ -1078,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp, | |||
1078 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | 1116 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) |
1079 | goto out; | 1117 | goto out; |
1080 | r = -E2BIG; | 1118 | r = -E2BIG; |
1081 | if (n < num_msrs_to_save) | 1119 | if (n < msr_list.nmsrs) |
1082 | goto out; | 1120 | goto out; |
1083 | r = -EFAULT; | 1121 | r = -EFAULT; |
1084 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | 1122 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, |
1085 | num_msrs_to_save * sizeof(u32))) | 1123 | num_msrs_to_save * sizeof(u32))) |
1086 | goto out; | 1124 | goto out; |
1087 | if (copy_to_user(user_msr_list->indices | 1125 | if (copy_to_user(user_msr_list->indices + num_msrs_to_save, |
1088 | + num_msrs_to_save * sizeof(u32), | ||
1089 | &emulated_msrs, | 1126 | &emulated_msrs, |
1090 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | 1127 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) |
1091 | goto out; | 1128 | goto out; |
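The copy_to_user() fix is plain pointer arithmetic: indices is a __u32 __user *, so indices + num_msrs_to_save already advances by num_msrs_to_save * sizeof(u32) bytes, and the old code scaled by sizeof(u32) a second time, writing the emulated-MSR list four times too far into the user buffer (the n < msr_list.nmsrs change likewise sizes the check against the full list, saved plus emulated). A small demonstration:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t buf[64];
		uint32_t *p = buf;

		/* Pointer arithmetic scales by the element size: offset 16 bytes. */
		printf("p + 4: offset %ld bytes\n",
		       (long)((char *)(p + 4) - (char *)p));
		/* The old, double-scaled form: offset 64 bytes. */
		printf("p + 4 * sizeof(u32): offset %ld bytes\n",
		       (long)((char *)(p + 4 * sizeof(uint32_t)) - (char *)p));
		return 0;
	}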
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index c1b6c232e02b..616de4628d60 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -1361,7 +1361,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1361 | return 0; | 1361 | return 0; |
1362 | } | 1362 | } |
1363 | 1363 | ||
1364 | void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) | 1364 | static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) |
1365 | { | 1365 | { |
1366 | u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); | 1366 | u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); |
1367 | /* | 1367 | /* |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7bc65f0f62c4..d677fa9ca650 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -22,7 +22,8 @@ | |||
22 | * | 22 | * |
23 | * So how does the kernel know it's a Guest? We'll see that later, but let's | 23 | * So how does the kernel know it's a Guest? We'll see that later, but let's |
24 | * just say that we end up here where we replace the native functions various | 24 | * just say that we end up here where we replace the native functions various |
25 | * "paravirt" structures with our Guest versions, then boot like normal. :*/ | 25 | * "paravirt" structures with our Guest versions, then boot like normal. |
26 | :*/ | ||
26 | 27 | ||
27 | /* | 28 | /* |
28 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | 29 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. |
@@ -74,7 +75,8 @@ | |||
74 | * | 75 | * |
75 | * The Guest in our tale is a simple creature: identical to the Host but | 76 | * The Guest in our tale is a simple creature: identical to the Host but |
76 | * behaving in simplified but equivalent ways. In particular, the Guest is the | 77 | * behaving in simplified but equivalent ways. In particular, the Guest is the |
77 | * same kernel as the Host (or at least, built from the same source code). :*/ | 78 | * same kernel as the Host (or at least, built from the same source code). |
79 | :*/ | ||
78 | 80 | ||
79 | struct lguest_data lguest_data = { | 81 | struct lguest_data lguest_data = { |
80 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, | 82 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, |
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = { | |||
85 | .syscall_vec = SYSCALL_VECTOR, | 87 | .syscall_vec = SYSCALL_VECTOR, |
86 | }; | 88 | }; |
87 | 89 | ||
88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a | 90 | /*G:037 |
91 | * async_hcall() is pretty simple: I'm quite proud of it really. We have a | ||
89 | * ring buffer of stored hypercalls which the Host will run through next time we | 92 | * ring buffer of stored hypercalls which the Host will run through next time we |
90 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall | 93 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall |
91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, | 94 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, |
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = { | |||
94 | * If we come around to a slot which hasn't been finished, then the table is | 97 | * If we come around to a slot which hasn't been finished, then the table is |
95 | * full and we just make the hypercall directly. This has the nice side | 98 | * full and we just make the hypercall directly. This has the nice side |
96 | * effect of causing the Host to run all the stored calls in the ring buffer | 99 | * effect of causing the Host to run all the stored calls in the ring buffer |
97 | * which empties it for next time! */ | 100 | * which empties it for next time! |
101 | */ | ||
98 | static void async_hcall(unsigned long call, unsigned long arg1, | 102 | static void async_hcall(unsigned long call, unsigned long arg1, |
99 | unsigned long arg2, unsigned long arg3, | 103 | unsigned long arg2, unsigned long arg3, |
100 | unsigned long arg4) | 104 | unsigned long arg4) |
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
103 | static unsigned int next_call; | 107 | static unsigned int next_call; |
104 | unsigned long flags; | 108 | unsigned long flags; |
105 | 109 | ||
106 | /* Disable interrupts if not already disabled: we don't want an | 110 | /* |
111 | * Disable interrupts if not already disabled: we don't want an | ||
107 | * interrupt handler making a hypercall while we're already doing | 112 | * interrupt handler making a hypercall while we're already doing |
108 | * one! */ | 113 | * one! |
114 | */ | ||
109 | local_irq_save(flags); | 115 | local_irq_save(flags); |
110 | if (lguest_data.hcall_status[next_call] != 0xFF) { | 116 | if (lguest_data.hcall_status[next_call] != 0xFF) { |
111 | /* Table full, so do normal hcall which will flush table. */ | 117 | /* Table full, so do normal hcall which will flush table. */ |
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
125 | local_irq_restore(flags); | 131 | local_irq_restore(flags); |
126 | } | 132 | } |
127 | 133 | ||
128 | /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first | 134 | /*G:035 |
129 | * real optimization trick! | 135 | * Notice the lazy_hcall() above, rather than hcall(). This is our first real |
136 | * optimization trick! | ||
130 | * | 137 | * |
131 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do | 138 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do |
132 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls | 139 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls |
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
136 | * lguest_leave_lazy_mode(). | 143 | * lguest_leave_lazy_mode(). |
137 | * | 144 | * |
138 | * So, when we're in lazy mode, we call async_hcall() to store the call for | 145 | * So, when we're in lazy mode, we call async_hcall() to store the call for |
139 | * future processing: */ | 146 | * future processing: |
147 | */ | ||
140 | static void lazy_hcall1(unsigned long call, | 148 | static void lazy_hcall1(unsigned long call, |
141 | unsigned long arg1) | 149 | unsigned long arg1) |
142 | { | 150 | { |
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call, | |||
146 | async_hcall(call, arg1, 0, 0, 0); | 154 | async_hcall(call, arg1, 0, 0, 0); |
147 | } | 155 | } |
148 | 156 | ||
157 | /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ | ||
149 | static void lazy_hcall2(unsigned long call, | 158 | static void lazy_hcall2(unsigned long call, |
150 | unsigned long arg1, | 159 | unsigned long arg1, |
151 | unsigned long arg2) | 160 | unsigned long arg2) |
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call, | |||
181 | } | 190 | } |
182 | #endif | 191 | #endif |
183 | 192 | ||
184 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 193 | /*G:036 |
185 | * issue the do-nothing hypercall to flush any stored calls. */ | 194 | * When lazy mode is turned off reset the per-cpu lazy mode variable and then |
195 | * issue the do-nothing hypercall to flush any stored calls. | ||
196 | :*/ | ||
186 | static void lguest_leave_lazy_mmu_mode(void) | 197 | static void lguest_leave_lazy_mmu_mode(void) |
187 | { | 198 | { |
188 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); | 199 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); |
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next) | |||
208 | * check there before it tries to deliver an interrupt. | 219 | * check there before it tries to deliver an interrupt. |
209 | */ | 220 | */ |
210 | 221 | ||
211 | /* save_flags() is expected to return the processor state (ie. "flags"). The | 222 | /* |
223 | * save_flags() is expected to return the processor state (ie. "flags"). The | ||
212 | * flags word contains all kind of stuff, but in practice Linux only cares | 224 | * flags word contains all kind of stuff, but in practice Linux only cares |
213 | * about the interrupt flag. Our "save_flags()" just returns that. */ | 225 | * about the interrupt flag. Our "save_flags()" just returns that. |
226 | */ | ||
214 | static unsigned long save_fl(void) | 227 | static unsigned long save_fl(void) |
215 | { | 228 | { |
216 | return lguest_data.irq_enabled; | 229 | return lguest_data.irq_enabled; |
@@ -222,13 +235,15 @@ static void irq_disable(void) | |||
222 | lguest_data.irq_enabled = 0; | 235 | lguest_data.irq_enabled = 0; |
223 | } | 236 | } |
224 | 237 | ||
225 | /* Let's pause a moment. Remember how I said these are called so often? | 238 | /* |
239 | * Let's pause a moment. Remember how I said these are called so often? | ||
226 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to | 240 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to |
227 | * break some rules. In particular, these functions are assumed to save their | 241 | * break some rules. In particular, these functions are assumed to save their |
228 | * own registers if they need to: normal C functions assume they can trash the | 242 | * own registers if they need to: normal C functions assume they can trash the |
229 | * eax register. To use normal C functions, we use | 243 | * eax register. To use normal C functions, we use |
230 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the | 244 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the |
231 | * C function, then restores it. */ | 245 | * C function, then restores it. |
246 | */ | ||
232 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | 247 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); |
233 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | 248 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); |
234 | /*:*/ | 249 | /*:*/ |
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | |||
237 | extern void lg_irq_enable(void); | 252 | extern void lg_irq_enable(void); |
238 | extern void lg_restore_fl(unsigned long flags); | 253 | extern void lg_restore_fl(unsigned long flags); |
239 | 254 | ||
240 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | 255 | /*M:003 |
241 | * them (or when we unmask an interrupt). This seems to work for the moment, | 256 | * We could be more efficient in our checking of outstanding interrupts, rather |
242 | * since interrupts are rare and we'll just get the interrupt on the next timer | 257 | * than using a branch. One way would be to put the "irq_enabled" field in a |
243 | * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way | 258 | * page by itself, and have the Host write-protect it when an interrupt comes |
244 | * would be to put the "irq_enabled" field in a page by itself, and have the | 259 | * in when irqs are disabled. There will then be a page fault as soon as |
245 | * Host write-protect it when an interrupt comes in when irqs are disabled. | 260 | * interrupts are re-enabled. |
246 | * There will then be a page fault as soon as interrupts are re-enabled. | ||
247 | * | 261 | * |
248 | * A better method is to implement soft interrupt disable generally for x86: | 262 | * A better method is to implement soft interrupt disable generally for x86: |
249 | * instead of disabling interrupts, we set a flag. If an interrupt does come | 263 | * instead of disabling interrupts, we set a flag. If an interrupt does come |
250 | * in, we then disable them for real. This is uncommon, so we could simply use | 264 | * in, we then disable them for real. This is uncommon, so we could simply use |
251 | * a hypercall for interrupt control and not worry about efficiency. :*/ | 265 | * a hypercall for interrupt control and not worry about efficiency. |
266 | :*/ | ||
252 | 267 | ||
253 | /*G:034 | 268 | /*G:034 |
254 | * The Interrupt Descriptor Table (IDT). | 269 | * The Interrupt Descriptor Table (IDT). |
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags); | |||
261 | static void lguest_write_idt_entry(gate_desc *dt, | 276 | static void lguest_write_idt_entry(gate_desc *dt, |
262 | int entrynum, const gate_desc *g) | 277 | int entrynum, const gate_desc *g) |
263 | { | 278 | { |
264 | /* The gate_desc structure is 8 bytes long: we hand it to the Host in | 279 | /* |
280 | * The gate_desc structure is 8 bytes long: we hand it to the Host in | ||
265 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors | 281 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors |
266 | * around like this; typesafety wasn't a big concern in Linux's early | 282 | * around like this; typesafety wasn't a big concern in Linux's early |
267 | * years. */ | 283 | * years. |
284 | */ | ||
268 | u32 *desc = (u32 *)g; | 285 | u32 *desc = (u32 *)g; |
269 | /* Keep the local copy up to date. */ | 286 | /* Keep the local copy up to date. */ |
270 | native_write_idt_entry(dt, entrynum, g); | 287 | native_write_idt_entry(dt, entrynum, g); |
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt, | |||
272 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); | 289 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); |
273 | } | 290 | } |
274 | 291 | ||
275 | /* Changing to a different IDT is very rare: we keep the IDT up-to-date every | 292 | /* |
293 | * Changing to a different IDT is very rare: we keep the IDT up-to-date every | ||
276 | * time it is written, so we can simply loop through all entries and tell the | 294 | * time it is written, so we can simply loop through all entries and tell the |
277 | * Host about them. */ | 295 | * Host about them. |
296 | */ | ||
278 | static void lguest_load_idt(const struct desc_ptr *desc) | 297 | static void lguest_load_idt(const struct desc_ptr *desc) |
279 | { | 298 | { |
280 | unsigned int i; | 299 | unsigned int i; |
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) | |||
305 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); | 324 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); |
306 | } | 325 | } |
307 | 326 | ||
308 | /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, | 327 | /* |
328 | * For a single GDT entry which changes, we do the lazy thing: alter our GDT, | ||
309 | * then tell the Host to reload the entire thing. This operation is so rare | 329 | * then tell the Host to reload the entire thing. This operation is so rare |
310 | * that this naive implementation is reasonable. */ | 330 | * that this naive implementation is reasonable. |
331 | */ | ||
311 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | 332 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, |
312 | const void *desc, int type) | 333 | const void *desc, int type) |
313 | { | 334 | { |
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | |||
317 | dt[entrynum].a, dt[entrynum].b); | 338 | dt[entrynum].a, dt[entrynum].b); |
318 | } | 339 | } |
319 | 340 | ||
320 | /* OK, I lied. There are three "thread local storage" GDT entries which change | 341 | /* |
342 | * OK, I lied. There are three "thread local storage" GDT entries which change | ||
321 | * on every context switch (these three entries are how glibc implements | 343 | * on every context switch (these three entries are how glibc implements |
322 | * __thread variables). So we have a hypercall specifically for this case. */ | 344 | * __thread variables). So we have a hypercall specifically for this case. |
345 | */ | ||
323 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) | 346 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) |
324 | { | 347 | { |
325 | /* There's one problem which normal hardware doesn't have: the Host | 348 | /* |
349 | * There's one problem which normal hardware doesn't have: the Host | ||
326 | * can't handle us removing entries we're currently using. So we clear | 350 | * can't handle us removing entries we're currently using. So we clear |
327 | * the GS register here: if it's needed it'll be reloaded anyway. */ | 351 | * the GS register here: if it's needed it'll be reloaded anyway. |
352 | */ | ||
328 | lazy_load_gs(0); | 353 | lazy_load_gs(0); |
329 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); | 354 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); |
330 | } | 355 | } |
331 | 356 | ||
332 | /*G:038 That's enough excitement for now, back to ploughing through each of | 357 | /*G:038 |
333 | * the different pv_ops structures (we're about 1/3 of the way through). | 358 | * That's enough excitement for now, back to ploughing through each of the |
359 | * different pv_ops structures (we're about 1/3 of the way through). | ||
334 | * | 360 | * |
335 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only | 361 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only |
336 | * uses this for some strange applications like Wine. We don't do anything | 362 | * uses this for some strange applications like Wine. We don't do anything |
337 | * here, so they'll get an informative and friendly Segmentation Fault. */ | 363 | * here, so they'll get an informative and friendly Segmentation Fault. |
364 | */ | ||
338 | static void lguest_set_ldt(const void *addr, unsigned entries) | 365 | static void lguest_set_ldt(const void *addr, unsigned entries) |
339 | { | 366 | { |
340 | } | 367 | } |
341 | 368 | ||
342 | /* This loads a GDT entry into the "Task Register": that entry points to a | 369 | /* |
370 | * This loads a GDT entry into the "Task Register": that entry points to a | ||
343 | * structure called the Task State Segment. Some comments scattered through the | 371 | * structure called the Task State Segment. Some comments scattered through the |
344 | * kernel code indicate that this was used for task switching in ages past, along | 372 | * kernel code indicate that this was used for task switching in ages past, along |
345 | * with blood sacrifice and astrology. | 373 | * with blood sacrifice and astrology. |
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) | |||
347 | * Now there's nothing interesting in here that we don't get told elsewhere. | 375 | * Now there's nothing interesting in here that we don't get told elsewhere. |
348 | * But the native version uses the "ltr" instruction, which makes the Host | 376 | * But the native version uses the "ltr" instruction, which makes the Host |
349 | * complain to the Guest about a Segmentation Fault and it'll oops. So we | 377 | * complain to the Guest about a Segmentation Fault and it'll oops. So we |
350 | * override the native version with a do-nothing version. */ | 378 | * override the native version with a do-nothing version. |
379 | */ | ||
351 | static void lguest_load_tr_desc(void) | 380 | static void lguest_load_tr_desc(void) |
352 | { | 381 | { |
353 | } | 382 | } |
354 | 383 | ||
355 | /* The "cpuid" instruction is a way of querying both the CPU identity | 384 | /* |
385 | * The "cpuid" instruction is a way of querying both the CPU identity | ||
356 | * (manufacturer, model, etc) and its features. It was introduced before the | 386 | * (manufacturer, model, etc) and its features. It was introduced before the |
357 | * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. | 387 | * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. |
358 | * As you might imagine, after a decade and a half of this treatment, it is now a | 388 | * As you might imagine, after a decade and a half of this treatment, it is now a |
359 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. | 389 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. |
360 | * | 390 | * |
361 | * This instruction even has its own Wikipedia entry. The Wikipedia entry | 391 | * This instruction even has its own Wikipedia entry. The Wikipedia entry |
362 | * has been translated into 4 languages. I am not making this up! | 392 | * has been translated into 5 languages. I am not making this up! |
363 | * | 393 | * |
364 | * We could get funky here and identify ourselves as "GenuineLguest", but | 394 | * We could get funky here and identify ourselves as "GenuineLguest", but |
365 | * instead we just use the real "cpuid" instruction. Then I pretty much turned | 395 | * instead we just use the real "cpuid" instruction. Then I pretty much turned |
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void) | |||
371 | * Replacing the cpuid so we can turn features off is great for the kernel, but | 401 | * Replacing the cpuid so we can turn features off is great for the kernel, but |
372 | * anyone (including userspace) can just use the raw "cpuid" instruction and | 402 | * anyone (including userspace) can just use the raw "cpuid" instruction and |
373 | * the Host won't even notice since it isn't privileged. So we try not to get | 403 | * the Host won't even notice since it isn't privileged. So we try not to get |
374 | * too worked up about it. */ | 404 | * too worked up about it. |
405 | */ | ||
375 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | 406 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, |
376 | unsigned int *cx, unsigned int *dx) | 407 | unsigned int *cx, unsigned int *dx) |
377 | { | 408 | { |
@@ -379,38 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
379 | 410 | ||
380 | native_cpuid(ax, bx, cx, dx); | 411 | native_cpuid(ax, bx, cx, dx); |
381 | switch (function) { | 412 | switch (function) { |
382 | case 1: /* Basic feature request. */ | 413 | /* |
383 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ | 414 | * CPUID 0 gives the highest legal CPUID number (and the ID string). |
415 | * We futureproof our code a little by sticking to known CPUID values. | ||
416 | */ | ||
417 | case 0: | ||
418 | if (*ax > 5) | ||
419 | *ax = 5; | ||
420 | break; | ||
421 | |||
422 | /* | ||
423 | * CPUID 1 is a basic feature request. | ||
424 | * | ||
425 | * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 | ||
426 | * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. | ||
427 | */ | ||
428 | case 1: | ||
384 | *cx &= 0x00002201; | 429 | *cx &= 0x00002201; |
385 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ | ||
386 | *dx &= 0x07808151; | 430 | *dx &= 0x07808151; |
387 | /* The Host can do a nice optimization if it knows that the | 431 | /* |
432 | * The Host can do a nice optimization if it knows that the | ||
388 | * kernel mappings (addresses above 0xC0000000 or whatever | 433 | * kernel mappings (addresses above 0xC0000000 or whatever |
389 | * PAGE_OFFSET is set to) haven't changed. But Linux calls | 434 | * PAGE_OFFSET is set to) haven't changed. But Linux calls |
390 | * flush_tlb_user() for both user and kernel mappings unless | 435 | * flush_tlb_user() for both user and kernel mappings unless |
391 | * the Page Global Enable (PGE) feature bit is set. */ | 436 | * the Page Global Enable (PGE) feature bit is set. |
437 | */ | ||
392 | *dx |= 0x00002000; | 438 | *dx |= 0x00002000; |
393 | /* We also lie, and say we're family id 5. 6 or greater | 439 | /* |
440 | * We also lie, and say we're family id 5. 6 or greater | ||
394 | * leads to a rdmsr in early_init_intel which we can't handle. | 441 | * leads to a rdmsr in early_init_intel which we can't handle. |
395 | * Family ID is returned as bits 8-11 in ax. */ | 442 | * Family ID is returned as bits 8-11 in ax. |
443 | */ | ||
396 | *ax &= 0xFFFFF0FF; | 444 | *ax &= 0xFFFFF0FF; |
397 | *ax |= 0x00000500; | 445 | *ax |= 0x00000500; |
398 | break; | 446 | break; |
447 | /* | ||
448 | * 0x80000000 returns the highest Extended Function, so we futureproof | ||
449 | * like we do above by limiting it to known fields. | ||
450 | */ | ||
399 | case 0x80000000: | 451 | case 0x80000000: |
400 | /* Futureproof this a little: if they ask how much extended | ||
401 | * processor information there is, limit it to known fields. */ | ||
402 | if (*ax > 0x80000008) | 452 | if (*ax > 0x80000008) |
403 | *ax = 0x80000008; | 453 | *ax = 0x80000008; |
404 | break; | 454 | break; |
455 | |||
456 | /* | ||
457 | * PAE systems can mark pages as non-executable. Linux calls this the | ||
458 | * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced | ||
459 | * Virus Protection). We just turn it off here, since we don't | ||
460 | * support it. | ||
461 | */ | ||
405 | case 0x80000001: | 462 | case 0x80000001: |
406 | /* Here we should fix nx cap depending on host. */ | ||
407 | /* For this version of PAE, we just clear NX bit. */ | ||
408 | *dx &= ~(1 << 20); | 463 | *dx &= ~(1 << 20); |
409 | break; | 464 | break; |
410 | } | 465 | } |
411 | } | 466 | } |
412 | 467 | ||
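To make the leaf-1 masking above concrete, here is a small userspace sketch (illustrative only; the input value is invented and this is not kernel code) of the family-ID rewrite on the EAX value:

        #include <stdio.h>

        int main(void)
        {
                /* Pretend CPUID leaf 1 came back reporting a family-6 CPU. */
                unsigned int eax = 0x000006f6;

                eax &= 0xFFFFF0FF;      /* clear the family bits (8-11) */
                eax |= 0x00000500;      /* claim to be family 5 instead */

                printf("family = %u\n", (eax >> 8) & 0xF);      /* prints 5 */
                return 0;
        }

For reference, the CX mask 0x00002201 keeps bits 0 (SSE3), 9 (SSSE3) and 13 (CMPXCHG16B), and ORing DX with 0x00002000 sets bit 13, the PGE feature bit mentioned in the comment.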
413 | /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | 468 | /* |
469 | * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | ||
414 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother | 470 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother |
415 | * it. The Host needs to know when the Guest wants to change them, so we have | 471 | * it. The Host needs to know when the Guest wants to change them, so we have |
416 | * a whole series of functions like read_cr0() and write_cr0(). | 472 | * a whole series of functions like read_cr0() and write_cr0(). |
@@ -425,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
425 | * name like "FPUTRAP bit" be a little less cryptic? | 481 | * name like "FPUTRAP bit" be a little less cryptic? |
426 | * | 482 | * |
427 | * We store cr0 locally because the Host never changes it. The Guest sometimes | 483 | * We store cr0 locally because the Host never changes it. The Guest sometimes |
428 | * wants to read it and we'd prefer not to bother the Host unnecessarily. */ | 484 | * wants to read it and we'd prefer not to bother the Host unnecessarily. |
485 | */ | ||
429 | static unsigned long current_cr0; | 486 | static unsigned long current_cr0; |
430 | static void lguest_write_cr0(unsigned long val) | 487 | static void lguest_write_cr0(unsigned long val) |
431 | { | 488 | { |
@@ -438,18 +495,22 @@ static unsigned long lguest_read_cr0(void) | |||
438 | return current_cr0; | 495 | return current_cr0; |
439 | } | 496 | } |
440 | 497 | ||
441 | /* Intel provided a special instruction to clear the TS bit for people too cool | 498 | /* |
499 | * Intel provided a special instruction to clear the TS bit for people too cool | ||
442 | * to use write_cr0() to do it. This "clts" instruction is faster, because all | 500 | * to use write_cr0() to do it. This "clts" instruction is faster, because all |
443 | * the vowels have been optimized out. */ | 501 | * the vowels have been optimized out. |
502 | */ | ||
444 | static void lguest_clts(void) | 503 | static void lguest_clts(void) |
445 | { | 504 | { |
446 | lazy_hcall1(LHCALL_TS, 0); | 505 | lazy_hcall1(LHCALL_TS, 0); |
447 | current_cr0 &= ~X86_CR0_TS; | 506 | current_cr0 &= ~X86_CR0_TS; |
448 | } | 507 | } |
449 | 508 | ||
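Since the TS bit is the only part of cr0 the Host acts on here, clts is just a shorthand: in terms of the helpers above it is equivalent to the following one-liner (a sketch, assuming lguest_write_cr0() forwards the TS bit via LHCALL_TS as the surrounding comments describe):

        lguest_write_cr0(lguest_read_cr0() & ~X86_CR0_TS);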
450 | /* cr2 is the virtual address of the last page fault, which the Guest only ever | 509 | /* |
510 | * cr2 is the virtual address of the last page fault, which the Guest only ever | ||
451 | * reads. The Host kindly writes this into our "struct lguest_data", so we | 511 | * reads. The Host kindly writes this into our "struct lguest_data", so we |
452 | * just read it out of there. */ | 512 | * just read it out of there. |
513 | */ | ||
453 | static unsigned long lguest_read_cr2(void) | 514 | static unsigned long lguest_read_cr2(void) |
454 | { | 515 | { |
455 | return lguest_data.cr2; | 516 | return lguest_data.cr2; |
@@ -458,10 +519,12 @@ static unsigned long lguest_read_cr2(void) | |||
458 | /* See lguest_set_pte() below. */ | 519 | /* See lguest_set_pte() below. */ |
459 | static bool cr3_changed = false; | 520 | static bool cr3_changed = false; |
460 | 521 | ||
461 | /* cr3 is the current toplevel pagetable page: the principle is the same as | 522 | /* |
523 | * cr3 is the current toplevel pagetable page: the principle is the same as | ||
462 | * cr0. Keep a local copy, and tell the Host when it changes. The only | 524 | * cr0. Keep a local copy, and tell the Host when it changes. The only |
463 | * difference is that our local copy is in lguest_data because the Host needs | 525 | * difference is that our local copy is in lguest_data because the Host needs |
464 | * to set it upon our initial hypercall. */ | 526 | * to set it upon our initial hypercall. |
527 | */ | ||
465 | static void lguest_write_cr3(unsigned long cr3) | 528 | static void lguest_write_cr3(unsigned long cr3) |
466 | { | 529 | { |
467 | lguest_data.pgdir = cr3; | 530 | lguest_data.pgdir = cr3; |
@@ -506,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) | |||
506 | * cr3 ---> +---------+ | 569 | * cr3 ---> +---------+ |
507 | * | --------->+---------+ | 570 | * | --------->+---------+ |
508 | * | | | PADDR1 | | 571 | * | | | PADDR1 | |
509 | * Top-level | | PADDR2 | | 572 | * Mid-level | | PADDR2 | |
510 | * (PMD) page | | | | 573 | * (PMD) page | | | |
511 | * | | Lower-level | | 574 | * | | Lower-level | |
512 | * | | (PTE) page | | 575 | * | | (PTE) page | |
@@ -526,21 +589,62 @@ static void lguest_write_cr4(unsigned long val) | |||
526 | * Index into top Index into second Offset within page | 589 | * Index into top Index into second Offset within page |
527 | * page directory page pagetable page | 590 | * page directory page pagetable page |
528 | * | 591 | * |
529 | * The kernel spends a lot of time changing both the top-level page directory | 592 | * Now, unfortunately, this isn't the whole story: Intel added Physical Address |
530 | * and lower-level pagetable pages. The Guest doesn't know physical addresses, | 593 | * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). |
531 | * so while it maintains these page tables exactly like normal, it also needs | 594 | * These are held in 64-bit page table entries, so we can now only fit 512 |
532 | * to keep the Host informed whenever it makes a change: the Host will create | 595 | * entries in a page, and the neat three-level tree breaks down. |
533 | * the real page tables based on the Guests'. | 596 | * |
597 | * The result is a four level page table: | ||
598 | * | ||
599 | * cr3 --> [ 4 Upper ] | ||
600 | * [ Level ] | ||
601 | * [ Entries ] | ||
602 | * [(PUD Page)]---> +---------+ | ||
603 | * | --------->+---------+ | ||
604 | * | | | PADDR1 | | ||
605 | * Mid-level | | PADDR2 | | ||
606 | * (PMD) page | | | | ||
607 | * | | Lower-level | | ||
608 | * | | (PTE) page | | ||
609 | * | | | | | ||
610 | * .... .... | ||
611 | * | ||
612 | * | ||
613 | * And the virtual address is decoded as: | ||
614 | * | ||
615 | * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
616 | * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| | ||
617 | * Index into Index into mid Index into lower Offset within page | ||
618 | * top entries directory page pagetable page | ||
619 | * | ||
620 | * It's too hard to switch between these two formats at runtime, so Linux only | ||
621 | * supports one or the other depending on whether CONFIG_X86_PAE is set. Many | ||
622 | * distributions turn it on, and not just for people with silly amounts of | ||
623 | * memory: the larger PTE entries allow room for the NX bit, which lets the | ||
624 | * kernel disable execution of pages and increase security. | ||
625 | * | ||
626 | * This was a problem for lguest, which couldn't run on these distributions; | ||
627 | * then Matias Zabaljauregui figured it all out and implemented it, and only a | ||
628 | * handful of puppies were crushed in the process! | ||
629 | * | ||
630 | * Back to our point: the kernel spends a lot of time changing both the | ||
631 | * top-level page directory and lower-level pagetable pages. The Guest doesn't | ||
632 | * know physical addresses, so while it maintains these page tables exactly | ||
633 | * like normal, it also needs to keep the Host informed whenever it makes a | ||
634 | * change: the Host will create the real page tables based on the Guests'. | ||
534 | */ | 635 | */ |
535 | 636 | ||
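As a sketch of the PAE split just described (ad-hoc helper names, not the kernel's real pgd_index()/pmd_index() macros):

        /* Decode a 32-bit virtual address under PAE: 2 + 9 + 9 + 12 bits. */
        static unsigned int top_idx(unsigned long va) { return va >> 30; }
        static unsigned int mid_idx(unsigned long va) { return (va >> 21) & 0x1FF; }
        static unsigned int pte_idx(unsigned long va) { return (va >> 12) & 0x1FF; }
        static unsigned int pg_off(unsigned long va)  { return va & 0xFFF; }

Without PAE the split is 10 + 10 + 12 instead: top = va >> 22, pte = (va >> 12) & 0x3FF, offset = va & 0xFFF.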
536 | /* The Guest calls this to set a second-level entry (pte), ie. to map a page | 637 | /* |
537 | * into a process' address space. We set the entry then tell the Host the | 638 | * The Guest calls this after it has set a second-level entry (pte), ie. to map |
538 | * toplevel and address this corresponds to. The Guest uses one pagetable per | 639 | * a page into a process' address space. We tell the Host the toplevel and |
539 | * process, so we need to tell the Host which one we're changing (mm->pgd). */ | 640 | * address this corresponds to. The Guest uses one pagetable per process, so |
641 | * we need to tell the Host which one we're changing (mm->pgd). | ||
642 | */ | ||
540 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 643 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
541 | pte_t *ptep) | 644 | pte_t *ptep) |
542 | { | 645 | { |
543 | #ifdef CONFIG_X86_PAE | 646 | #ifdef CONFIG_X86_PAE |
647 | /* PAE needs to hand the Host a 64 bit page table entry, so it uses two args. */ | ||
544 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, | 648 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, |
545 | ptep->pte_low, ptep->pte_high); | 649 | ptep->pte_low, ptep->pte_high); |
546 | #else | 650 | #else |
@@ -548,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | |||
548 | #endif | 652 | #endif |
549 | } | 653 | } |
550 | 654 | ||
655 | /* This is the "set and update" combo-meal-deal version. */ | ||
551 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 656 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
552 | pte_t *ptep, pte_t pteval) | 657 | pte_t *ptep, pte_t pteval) |
553 | { | 658 | { |
@@ -555,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
555 | lguest_pte_update(mm, addr, ptep); | 660 | lguest_pte_update(mm, addr, ptep); |
556 | } | 661 | } |
557 | 662 | ||
558 | /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | 663 | /* |
664 | * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | ||
559 | * to set a middle-level entry when PAE is activated. | 665 | * to set a middle-level entry when PAE is activated. |
666 | * | ||
560 | * Again, we set the entry then tell the Host which page we changed, | 667 | * Again, we set the entry then tell the Host which page we changed, |
561 | * and the index of the entry we changed. */ | 668 | * and the index of the entry we changed. |
669 | */ | ||
562 | #ifdef CONFIG_X86_PAE | 670 | #ifdef CONFIG_X86_PAE |
563 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) | 671 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) |
564 | { | 672 | { |
@@ -577,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
577 | } | 685 | } |
578 | #else | 686 | #else |
579 | 687 | ||
580 | /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not | 688 | /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ |
581 | * activated. */ | ||
582 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | 689 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) |
583 | { | 690 | { |
584 | native_set_pmd(pmdp, pmdval); | 691 | native_set_pmd(pmdp, pmdval); |
@@ -587,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
587 | } | 694 | } |
588 | #endif | 695 | #endif |
589 | 696 | ||
590 | /* There are a couple of legacy places where the kernel sets a PTE, but we | 697 | /* |
698 | * There are a couple of legacy places where the kernel sets a PTE, but we | ||
591 | * don't know the top level any more. This is useless for us, since we don't | 699 | * don't know the top level any more. This is useless for us, since we don't |
592 | * know which pagetable is changing or what address, so we just tell the Host | 700 | * know which pagetable is changing or what address, so we just tell the Host |
593 | * to forget all of them. Fortunately, this is very rare. | 701 | * to forget all of them. Fortunately, this is very rare. |
@@ -595,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
595 | * ... except in early boot when the kernel sets up the initial pagetables, | 703 | * ... except in early boot when the kernel sets up the initial pagetables, |
596 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell | 704 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell |
597 | * the Host anything changed until we've done the first page table switch, | 705 | * the Host anything changed until we've done the first page table switch, |
598 | * which brings boot back to 0.25 seconds. */ | 706 | * which brings boot back to 0.25 seconds. |
707 | */ | ||
599 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 708 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
600 | { | 709 | { |
601 | native_set_pte(ptep, pteval); | 710 | native_set_pte(ptep, pteval); |
@@ -604,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) | |||
604 | } | 713 | } |
605 | 714 | ||
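The hunk above skips part of lguest_set_pte's body; given the comment and the cr3_changed flag declared earlier, the skipped guard presumably amounts to something like:

        if (cr3_changed)
                lazy_hcall1(LHCALL_FLUSH_TLB, 1);

i.e. stay silent until the first pagetable switch has happened.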
606 | #ifdef CONFIG_X86_PAE | 715 | #ifdef CONFIG_X86_PAE |
716 | /* | ||
717 | * With 64-bit PTE values, we need to be careful setting them: if we set 32 | ||
718 | * bits at a time, the hardware could see a weird half-set entry. These | ||
719 | * versions ensure we update all 64 bits at once. | ||
720 | */ | ||
607 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | 721 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) |
608 | { | 722 | { |
609 | native_set_pte_atomic(ptep, pte); | 723 | native_set_pte_atomic(ptep, pte); |
@@ -611,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | |||
611 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 725 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
612 | } | 726 | } |
613 | 727 | ||
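A kernel-style sketch of why the ordering matters (assuming the usual 32-bit PAE layout where pte_low carries the present bit; this mirrors what the native PAE set_pte helpers do):

        static void set_pte_64bit(pte_t *ptep, pte_t pte)
        {
                ptep->pte_high = pte.pte_high;
                smp_wmb();                      /* high word lands before low word */
                ptep->pte_low = pte.pte_low;    /* entry becomes present here */
        }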
614 | void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | 728 | static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, |
729 | pte_t *ptep) | ||
615 | { | 730 | { |
616 | native_pte_clear(mm, addr, ptep); | 731 | native_pte_clear(mm, addr, ptep); |
617 | lguest_pte_update(mm, addr, ptep); | 732 | lguest_pte_update(mm, addr, ptep); |
618 | } | 733 | } |
619 | 734 | ||
620 | void lguest_pmd_clear(pmd_t *pmdp) | 735 | static void lguest_pmd_clear(pmd_t *pmdp) |
621 | { | 736 | { |
622 | lguest_set_pmd(pmdp, __pmd(0)); | 737 | lguest_set_pmd(pmdp, __pmd(0)); |
623 | } | 738 | } |
624 | #endif | 739 | #endif |
625 | 740 | ||
626 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | 741 | /* |
742 | * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | ||
627 | * native page table operations. On native hardware you can set a new page | 743 | * native page table operations. On native hardware you can set a new page |
628 | * table entry whenever you want, but if you want to remove one you have to do | 744 | * table entry whenever you want, but if you want to remove one you have to do |
629 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). | 745 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). |
@@ -632,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp) | |||
632 | * called when a valid entry is written, not when it's removed (ie. marked not | 748 | * called when a valid entry is written, not when it's removed (ie. marked not |
633 | * present). Instead, this is where we come when the Guest wants to remove a | 749 | * present). Instead, this is where we come when the Guest wants to remove a |
634 | * page table entry: we tell the Host to set that entry to 0 (ie. the present | 750 | * page table entry: we tell the Host to set that entry to 0 (ie. the present |
635 | * bit is zero). */ | 751 | * bit is zero). |
752 | */ | ||
636 | static void lguest_flush_tlb_single(unsigned long addr) | 753 | static void lguest_flush_tlb_single(unsigned long addr) |
637 | { | 754 | { |
638 | /* Simply set it to zero: if it was not, it will fault back in. */ | 755 | /* Simply set it to zero: if it was not, it will fault back in. */ |
639 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); | 756 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); |
640 | } | 757 | } |
641 | 758 | ||
642 | /* This is what happens after the Guest has removed a large number of entries. | 759 | /* |
760 | * This is what happens after the Guest has removed a large number of entries. | ||
643 | * This tells the Host that any of the page table entries for userspace might | 761 | * This tells the Host that any of the page table entries for userspace might |
644 | * have changed, ie. virtual addresses below PAGE_OFFSET. */ | 762 | * have changed, ie. virtual addresses below PAGE_OFFSET. |
763 | */ | ||
645 | static void lguest_flush_tlb_user(void) | 764 | static void lguest_flush_tlb_user(void) |
646 | { | 765 | { |
647 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); | 766 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); |
648 | } | 767 | } |
649 | 768 | ||
650 | /* This is called when the kernel page tables have changed. That's not very | 769 | /* |
770 | * This is called when the kernel page tables have changed. That's not very | ||
651 | * common (unless the Guest is using highmem, which makes the Guest extremely | 771 | * common (unless the Guest is using highmem, which makes the Guest extremely |
652 | * slow), so it's worth separating this from the user flushing above. */ | 772 | * slow), so it's worth separating this from the user flushing above. |
773 | */ | ||
653 | static void lguest_flush_tlb_kernel(void) | 774 | static void lguest_flush_tlb_kernel(void) |
654 | { | 775 | { |
655 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 776 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
@@ -686,26 +807,38 @@ static struct irq_chip lguest_irq_controller = { | |||
686 | .unmask = enable_lguest_irq, | 807 | .unmask = enable_lguest_irq, |
687 | }; | 808 | }; |
688 | 809 | ||
689 | /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | 810 | /* |
811 | * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | ||
690 | * interrupt (except 128, which is used for system calls), and then tells the | 812 | * interrupt (except 128, which is used for system calls), and then tells the |
691 | * Linux infrastructure that each interrupt is controlled by our level-based | 813 | * Linux infrastructure that each interrupt is controlled by our level-based |
692 | * lguest interrupt controller. */ | 814 | * lguest interrupt controller. |
815 | */ | ||
693 | static void __init lguest_init_IRQ(void) | 816 | static void __init lguest_init_IRQ(void) |
694 | { | 817 | { |
695 | unsigned int i; | 818 | unsigned int i; |
696 | 819 | ||
697 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | 820 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
698 | /* Some systems map "vectors" to interrupts weirdly. Lguest has | 821 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ |
699 | * a straightforward 1 to 1 mapping, so force that here. */ | ||
700 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; | 822 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; |
701 | if (i != SYSCALL_VECTOR) | 823 | if (i != SYSCALL_VECTOR) |
702 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 824 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
703 | } | 825 | } |
704 | /* This call is required to set up for 4k stacks, where we have | 826 | |
705 | * separate stacks for hard and soft interrupts. */ | 827 | /* |
828 | * This call is required to set up for 4k stacks, where we have | ||
829 | * separate stacks for hard and soft interrupts. | ||
830 | */ | ||
706 | irq_ctx_init(smp_processor_id()); | 831 | irq_ctx_init(smp_processor_id()); |
707 | } | 832 | } |
708 | 833 | ||
834 | /* | ||
835 | * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so | ||
836 | * rather than set them in lguest_init_IRQ we are called here every time an | ||
837 | * lguest device needs an interrupt. | ||
838 | * | ||
839 | * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory; we should | ||
840 | * pass that up! | ||
841 | */ | ||
709 | void lguest_setup_irq(unsigned int irq) | 842 | void lguest_setup_irq(unsigned int irq) |
710 | { | 843 | { |
711 | irq_to_desc_alloc_node(irq, 0); | 844 | irq_to_desc_alloc_node(irq, 0); |
@@ -724,31 +857,39 @@ static unsigned long lguest_get_wallclock(void) | |||
724 | return lguest_data.time.tv_sec; | 857 | return lguest_data.time.tv_sec; |
725 | } | 858 | } |
726 | 859 | ||
727 | /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | 860 | /* |
861 | * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | ||
728 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. | 862 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. |
729 | * This matches what we want here: if we return 0 from this function, the x86 | 863 | * This matches what we want here: if we return 0 from this function, the x86 |
730 | * TSC clock will give up and not register itself. */ | 864 | * TSC clock will give up and not register itself. |
865 | */ | ||
731 | static unsigned long lguest_tsc_khz(void) | 866 | static unsigned long lguest_tsc_khz(void) |
732 | { | 867 | { |
733 | return lguest_data.tsc_khz; | 868 | return lguest_data.tsc_khz; |
734 | } | 869 | } |
735 | 870 | ||
736 | /* If we can't use the TSC, the kernel falls back to our lower-priority | 871 | /* |
737 | * "lguest_clock", where we read the time value given to us by the Host. */ | 872 | * If we can't use the TSC, the kernel falls back to our lower-priority |
873 | * "lguest_clock", where we read the time value given to us by the Host. | ||
874 | */ | ||
738 | static cycle_t lguest_clock_read(struct clocksource *cs) | 875 | static cycle_t lguest_clock_read(struct clocksource *cs) |
739 | { | 876 | { |
740 | unsigned long sec, nsec; | 877 | unsigned long sec, nsec; |
741 | 878 | ||
742 | /* Since the time is in two parts (seconds and nanoseconds), we risk | 879 | /* |
880 | * Since the time is in two parts (seconds and nanoseconds), we risk | ||
743 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, | 881 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, |
744 | * and getting 99 and 0. As Linux tends to come apart under the stress | 882 | * and getting 99 and 0. As Linux tends to come apart under the stress |
745 | * of time travel, we must be careful: */ | 883 | * of time travel, we must be careful: |
884 | */ | ||
746 | do { | 885 | do { |
747 | /* First we read the seconds part. */ | 886 | /* First we read the seconds part. */ |
748 | sec = lguest_data.time.tv_sec; | 887 | sec = lguest_data.time.tv_sec; |
749 | /* This read memory barrier tells the compiler and the CPU that | 888 | /* |
889 | * This read memory barrier tells the compiler and the CPU that | ||
750 | * this can't be reordered: we have to complete the above | 890 | * this can't be reordered: we have to complete the above |
751 | * before going on. */ | 891 | * before going on. |
892 | */ | ||
752 | rmb(); | 893 | rmb(); |
753 | /* Now we read the nanoseconds part. */ | 894 | /* Now we read the nanoseconds part. */ |
754 | nsec = lguest_data.time.tv_nsec; | 895 | nsec = lguest_data.time.tv_nsec; |
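Putting the whole retry pattern in one place, as a sketch (it assumes the Host pairs these reads with matching write barriers on its side):

        do {
                sec = lguest_data.time.tv_sec;
                rmb();          /* read seconds strictly before nanoseconds */
                nsec = lguest_data.time.tv_nsec;
                rmb();          /* and nanoseconds before rechecking */
        } while (unlikely(lguest_data.time.tv_sec != sec));

        return ((u64)sec * 1000000000ULL) + nsec;

If tv_sec changed underneath us we simply go around again; rollovers happen once a second, so the loop almost always runs exactly once.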
@@ -772,9 +913,11 @@ static struct clocksource lguest_clock = { | |||
772 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 913 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
773 | }; | 914 | }; |
774 | 915 | ||
775 | /* We also need a "struct clock_event_device": Linux asks us to set it to go | 916 | /* |
917 | * We also need a "struct clock_event_device": Linux asks us to set it to go | ||
776 | * off some time in the future. Actually, James Morris figured all this out, I | 918 | * off some time in the future. Actually, James Morris figured all this out, I |
777 | * just applied the patch. */ | 919 | * just applied the patch. |
920 | */ | ||
778 | static int lguest_clockevent_set_next_event(unsigned long delta, | 921 | static int lguest_clockevent_set_next_event(unsigned long delta, |
779 | struct clock_event_device *evt) | 922 | struct clock_event_device *evt) |
780 | { | 923 | { |
@@ -824,8 +967,10 @@ static struct clock_event_device lguest_clockevent = { | |||
824 | .max_delta_ns = LG_CLOCK_MAX_DELTA, | 967 | .max_delta_ns = LG_CLOCK_MAX_DELTA, |
825 | }; | 968 | }; |
826 | 969 | ||
827 | /* This is the Guest timer interrupt handler (hardware interrupt 0). We just | 970 | /* |
828 | * call the clockevent infrastructure and it does whatever needs doing. */ | 971 | * This is the Guest timer interrupt handler (hardware interrupt 0). We just |
972 | * call the clockevent infrastructure and it does whatever needs doing. | ||
973 | */ | ||
829 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | 974 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) |
830 | { | 975 | { |
831 | unsigned long flags; | 976 | unsigned long flags; |
@@ -836,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | |||
836 | local_irq_restore(flags); | 981 | local_irq_restore(flags); |
837 | } | 982 | } |
838 | 983 | ||
839 | /* At some point in the boot process, we get asked to set up our timing | 984 | /* |
985 | * At some point in the boot process, we get asked to set up our timing | ||
840 | * infrastructure. The kernel doesn't expect timer interrupts before this, but | 986 | * infrastructure. The kernel doesn't expect timer interrupts before this, but |
841 | * we cleverly initialized the "blocked_interrupts" field of "struct | 987 | * we cleverly initialized the "blocked_interrupts" field of "struct |
842 | * lguest_data" so that timer interrupts were blocked until now. */ | 988 | * lguest_data" so that timer interrupts were blocked until now. |
989 | */ | ||
843 | static void lguest_time_init(void) | 990 | static void lguest_time_init(void) |
844 | { | 991 | { |
845 | /* Set up the timer interrupt (0) to go to our simple timer routine */ | 992 | /* Set up the timer interrupt (0) to go to our simple timer routine */ |
@@ -863,14 +1010,16 @@ static void lguest_time_init(void) | |||
863 | * to work. They're pretty simple. | 1010 | * to work. They're pretty simple. |
864 | */ | 1011 | */ |
865 | 1012 | ||
866 | /* The Guest needs to tell the Host what stack it expects traps to use. For | 1013 | /* |
1014 | * The Guest needs to tell the Host what stack it expects traps to use. For | ||
867 | * native hardware, this is part of the Task State Segment mentioned above in | 1015 | * native hardware, this is part of the Task State Segment mentioned above in |
868 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. | 1016 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. |
869 | * | 1017 | * |
870 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data | 1018 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data |
871 | * segment), the privilege level (we're privilege level 1, the Host is 0 and | 1019 | * segment), the privilege level (we're privilege level 1, the Host is 0 and |
872 | * will not tolerate us trying to use that), the stack pointer, and the number | 1020 | * will not tolerate us trying to use that), the stack pointer, and the number |
873 | * of pages in the stack. */ | 1021 | * of pages in the stack. |
1022 | */ | ||
874 | static void lguest_load_sp0(struct tss_struct *tss, | 1023 | static void lguest_load_sp0(struct tss_struct *tss, |
875 | struct thread_struct *thread) | 1024 | struct thread_struct *thread) |
876 | { | 1025 | { |
@@ -884,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) | |||
884 | /* FIXME: Implement */ | 1033 | /* FIXME: Implement */ |
885 | } | 1034 | } |
886 | 1035 | ||
887 | /* There are times when the kernel wants to make sure that no memory writes are | 1036 | /* |
1037 | * There are times when the kernel wants to make sure that no memory writes are | ||
888 | * caught in the cache (that they've all reached real hardware devices). This | 1038 | * caught in the cache (that they've all reached real hardware devices). This |
889 | * doesn't matter for the Guest which has virtual hardware. | 1039 | * doesn't matter for the Guest which has virtual hardware. |
890 | * | 1040 | * |
@@ -898,11 +1048,13 @@ static void lguest_wbinvd(void) | |||
898 | { | 1048 | { |
899 | } | 1049 | } |
900 | 1050 | ||
901 | /* If the Guest expects to have an Advanced Programmable Interrupt Controller, | 1051 | /* |
1052 | * If the Guest expects to have an Advanced Programmable Interrupt Controller, | ||
902 | * we play dumb by ignoring writes and returning 0 for reads. So it's no | 1053 | * we play dumb by ignoring writes and returning 0 for reads. So it's no |
903 | * longer Programmable nor Controlling anything, and I don't think 8 lines of | 1054 | * longer Programmable nor Controlling anything, and I don't think 8 lines of |
904 | * code qualifies for Advanced. It will also never interrupt anything. It | 1055 | * code qualifies for Advanced. It will also never interrupt anything. It |
905 | * does, however, allow us to get through the Linux boot code. */ | 1056 | * does, however, allow us to get through the Linux boot code. |
1057 | */ | ||
906 | #ifdef CONFIG_X86_LOCAL_APIC | 1058 | #ifdef CONFIG_X86_LOCAL_APIC |
907 | static void lguest_apic_write(u32 reg, u32 v) | 1059 | static void lguest_apic_write(u32 reg, u32 v) |
908 | { | 1060 | { |
@@ -951,11 +1103,13 @@ static void lguest_safe_halt(void) | |||
951 | kvm_hypercall0(LHCALL_HALT); | 1103 | kvm_hypercall0(LHCALL_HALT); |
952 | } | 1104 | } |
953 | 1105 | ||
954 | /* The SHUTDOWN hypercall takes a string to describe what's happening, and | 1106 | /* |
1107 | * The SHUTDOWN hypercall takes a string to describe what's happening, and | ||
955 | * an argument which says whether this is to restart (reboot) the Guest or not. | 1108 | * an argument which says whether this is to restart (reboot) the Guest or not. |
956 | * | 1109 | * |
957 | * Note that the Host always prefers that the Guest speak in physical addresses | 1110 | * Note that the Host always prefers that the Guest speak in physical addresses |
958 | * rather than virtual addresses, so we use __pa() here. */ | 1111 | * rather than virtual addresses, so we use __pa() here. |
1112 | */ | ||
959 | static void lguest_power_off(void) | 1113 | static void lguest_power_off(void) |
960 | { | 1114 | { |
961 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), | 1115 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), |
@@ -986,8 +1140,10 @@ static __init char *lguest_memory_setup(void) | |||
986 | * nice to move it back to lguest_init. Patch welcome... */ | 1140 | * nice to move it back to lguest_init. Patch welcome... */ |
987 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | 1141 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); |
988 | 1142 | ||
989 | /* The Linux bootloader header contains an "e820" memory map: the | 1143 | /* |
990 | * Launcher populated the first entry with our memory limit. */ | 1144 | * The Linux bootloader header contains an "e820" memory map: the |
1145 | * Launcher populated the first entry with our memory limit. | ||
1146 | */ | ||
991 | e820_add_region(boot_params.e820_map[0].addr, | 1147 | e820_add_region(boot_params.e820_map[0].addr, |
992 | boot_params.e820_map[0].size, | 1148 | boot_params.e820_map[0].size, |
993 | boot_params.e820_map[0].type); | 1149 | boot_params.e820_map[0].type); |
@@ -996,16 +1152,17 @@ static __init char *lguest_memory_setup(void) | |||
996 | return "LGUEST"; | 1152 | return "LGUEST"; |
997 | } | 1153 | } |
998 | 1154 | ||
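Illustratively (values invented; the field names are those of the kernel's struct e820entry), a 64MB Guest would show up with a single usable-RAM entry for the call above to feed into the kernel's map:

        struct e820entry example = {
                .addr = 0,                      /* Guest physical address 0 */
                .size = 64 * 1024 * 1024,      /* 64MB */
                .type = E820_RAM,              /* usable memory */
        };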
999 | /* We will eventually use the virtio console device to produce console output, | 1155 | /* |
1156 | * We will eventually use the virtio console device to produce console output, | ||
1000 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce | 1157 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce |
1001 | * console output. */ | 1158 | * console output. |
1159 | */ | ||
1002 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) | 1160 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) |
1003 | { | 1161 | { |
1004 | char scratch[17]; | 1162 | char scratch[17]; |
1005 | unsigned int len = count; | 1163 | unsigned int len = count; |
1006 | 1164 | ||
1007 | /* We use a nul-terminated string, so we have to make a copy. Icky, | 1165 | /* We use a nul-terminated string, so we make a copy. Icky, huh? */ |
1008 | * huh? */ | ||
1009 | if (len > sizeof(scratch) - 1) | 1166 | if (len > sizeof(scratch) - 1) |
1010 | len = sizeof(scratch) - 1; | 1167 | len = sizeof(scratch) - 1; |
1011 | scratch[len] = '\0'; | 1168 | scratch[len] = '\0'; |
@@ -1016,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) | |||
1016 | return len; | 1173 | return len; |
1017 | } | 1174 | } |
1018 | 1175 | ||
1019 | /* Rebooting also tells the Host we're finished, but the RESTART flag tells the | 1176 | /* |
1020 | * Launcher to reboot us. */ | 1177 | * Rebooting also tells the Host we're finished, but the RESTART flag tells the |
1178 | * Launcher to reboot us. | ||
1179 | */ | ||
1021 | static void lguest_restart(char *reason) | 1180 | static void lguest_restart(char *reason) |
1022 | { | 1181 | { |
1023 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); | 1182 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); |
@@ -1044,7 +1203,8 @@ static void lguest_restart(char *reason) | |||
1044 | * fit comfortably. | 1203 | * fit comfortably. |
1045 | * | 1204 | * |
1046 | * First we need assembly templates of each of the patchable Guest operations, | 1205 | * First we need assembly templates of each of the patchable Guest operations, |
1047 | * and these are in i386_head.S. */ | 1206 | * and these are in i386_head.S. |
1207 | */ | ||
1048 | 1208 | ||
1049 | /*G:060 We construct a table from the assembler templates: */ | 1209 | /*G:060 We construct a table from the assembler templates: */ |
1050 | static const struct lguest_insns | 1210 | static const struct lguest_insns |
@@ -1055,9 +1215,11 @@ static const struct lguest_insns | |||
1055 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, | 1215 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, |
1056 | }; | 1216 | }; |
1057 | 1217 | ||
1058 | /* Now our patch routine is fairly simple (based on the native one in | 1218 | /* |
1219 | * Now our patch routine is fairly simple (based on the native one in | ||
1059 | * paravirt.c). If we have a replacement, we copy it in and return how much of | 1220 | * paravirt.c). If we have a replacement, we copy it in and return how much of |
1060 | * the available space we used. */ | 1221 | * the available space we used. |
1222 | */ | ||
1061 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | 1223 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, |
1062 | unsigned long addr, unsigned len) | 1224 | unsigned long addr, unsigned len) |
1063 | { | 1225 | { |
@@ -1069,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1069 | 1231 | ||
1070 | insn_len = lguest_insns[type].end - lguest_insns[type].start; | 1232 | insn_len = lguest_insns[type].end - lguest_insns[type].start; |
1071 | 1233 | ||
1072 | /* Similarly if we can't fit the replacement (shouldn't happen, but let's | 1234 | /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ |
1073 | * be thorough). */ | ||
1074 | if (len < insn_len) | 1235 | if (len < insn_len) |
1075 | return paravirt_patch_default(type, clobber, ibuf, addr, len); | 1236 | return paravirt_patch_default(type, clobber, ibuf, addr, len); |
1076 | 1237 | ||
@@ -1079,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1079 | return insn_len; | 1240 | return insn_len; |
1080 | } | 1241 | } |
1081 | 1242 | ||
1082 | /*G:030 Once we get to lguest_init(), we know we're a Guest. The various | 1243 | /*G:029 |
1244 | * Once we get to lguest_init(), we know we're a Guest. The various | ||
1083 | * pv_ops structures in the kernel provide points for (almost) every routine we | 1245 | * pv_ops structures in the kernel provide points for (almost) every routine we |
1084 | * have to override to avoid privileged instructions. */ | 1246 | * have to override to avoid privileged instructions. |
1247 | */ | ||
1085 | __init void lguest_init(void) | 1248 | __init void lguest_init(void) |
1086 | { | 1249 | { |
1087 | /* We're under lguest, paravirt is enabled, and we're running at | 1250 | /* We're under lguest. */ |
1088 | * privilege level 1, not 0 as normal. */ | ||
1089 | pv_info.name = "lguest"; | 1251 | pv_info.name = "lguest"; |
1252 | /* Paravirt is enabled. */ | ||
1090 | pv_info.paravirt_enabled = 1; | 1253 | pv_info.paravirt_enabled = 1; |
1254 | /* We're running at privilege level 1, not 0 as normal. */ | ||
1091 | pv_info.kernel_rpl = 1; | 1255 | pv_info.kernel_rpl = 1; |
1256 | /* Everyone except Xen runs with this set. */ | ||
1092 | pv_info.shared_kernel_pmd = 1; | 1257 | pv_info.shared_kernel_pmd = 1; |
1093 | 1258 | ||
1094 | /* We set up all the lguest overrides for sensitive operations. These | 1259 | /* |
1095 | * are detailed with the operations themselves. */ | 1260 | * We set up all the lguest overrides for sensitive operations. These |
1261 | * are detailed with the operations themselves. | ||
1262 | */ | ||
1096 | 1263 | ||
1097 | /* interrupt-related operations */ | 1264 | /* Interrupt-related operations */ |
1098 | pv_irq_ops.init_IRQ = lguest_init_IRQ; | 1265 | pv_irq_ops.init_IRQ = lguest_init_IRQ; |
1099 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); | 1266 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); |
1100 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); | 1267 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); |
@@ -1102,11 +1269,11 @@ __init void lguest_init(void) | |||
1102 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); | 1269 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); |
1103 | pv_irq_ops.safe_halt = lguest_safe_halt; | 1270 | pv_irq_ops.safe_halt = lguest_safe_halt; |
1104 | 1271 | ||
1105 | /* init-time operations */ | 1272 | /* Setup operations */ |
1106 | pv_init_ops.memory_setup = lguest_memory_setup; | 1273 | pv_init_ops.memory_setup = lguest_memory_setup; |
1107 | pv_init_ops.patch = lguest_patch; | 1274 | pv_init_ops.patch = lguest_patch; |
1108 | 1275 | ||
1109 | /* Intercepts of various cpu instructions */ | 1276 | /* Intercepts of various CPU instructions */ |
1110 | pv_cpu_ops.load_gdt = lguest_load_gdt; | 1277 | pv_cpu_ops.load_gdt = lguest_load_gdt; |
1111 | pv_cpu_ops.cpuid = lguest_cpuid; | 1278 | pv_cpu_ops.cpuid = lguest_cpuid; |
1112 | pv_cpu_ops.load_idt = lguest_load_idt; | 1279 | pv_cpu_ops.load_idt = lguest_load_idt; |
@@ -1127,7 +1294,7 @@ __init void lguest_init(void) | |||
1127 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; | 1294 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; |
1128 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; | 1295 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; |
1129 | 1296 | ||
1130 | /* pagetable management */ | 1297 | /* Pagetable management */ |
1131 | pv_mmu_ops.write_cr3 = lguest_write_cr3; | 1298 | pv_mmu_ops.write_cr3 = lguest_write_cr3; |
1132 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; | 1299 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; |
1133 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; | 1300 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; |
@@ -1149,54 +1316,71 @@ __init void lguest_init(void) | |||
1149 | pv_mmu_ops.pte_update_defer = lguest_pte_update; | 1316 | pv_mmu_ops.pte_update_defer = lguest_pte_update; |
1150 | 1317 | ||
1151 | #ifdef CONFIG_X86_LOCAL_APIC | 1318 | #ifdef CONFIG_X86_LOCAL_APIC |
1152 | /* apic read/write intercepts */ | 1319 | /* APIC read/write intercepts */ |
1153 | set_lguest_basic_apic_ops(); | 1320 | set_lguest_basic_apic_ops(); |
1154 | #endif | 1321 | #endif |
1155 | 1322 | ||
1156 | /* time operations */ | 1323 | /* Time operations */ |
1157 | pv_time_ops.get_wallclock = lguest_get_wallclock; | 1324 | pv_time_ops.get_wallclock = lguest_get_wallclock; |
1158 | pv_time_ops.time_init = lguest_time_init; | 1325 | pv_time_ops.time_init = lguest_time_init; |
1159 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; | 1326 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; |
1160 | 1327 | ||
1161 | /* Now is a good time to look at the implementations of these functions | 1328 | /* |
1162 | * before returning to the rest of lguest_init(). */ | 1329 | * Now is a good time to look at the implementations of these functions |
1330 | * before returning to the rest of lguest_init(). | ||
1331 | */ | ||
1163 | 1332 | ||
1164 | /*G:070 Now we've seen all the paravirt_ops, we return to | 1333 | /*G:070 |
1334 | * Now we've seen all the paravirt_ops, we return to | ||
1165 | * lguest_init() where the rest of the fairly chaotic boot setup | 1335 | * lguest_init() where the rest of the fairly chaotic boot setup |
1166 | * occurs. */ | 1336 | * occurs. |
1337 | */ | ||
1167 | 1338 | ||
1168 | /* The stack protector is a weird thing where gcc places a canary | 1339 | /* |
1340 | * The stack protector is a weird thing where gcc places a canary | ||
1169 | * value on the stack and then checks it on return. This file is | 1341 | * value on the stack and then checks it on return. This file is |
1170 | * compiled with -fno-stack-protector, so we got this far without | 1342 | * compiled with -fno-stack-protector, so we got this far without |
1171 | * problems. The value of the canary is kept at offset 20 from the | 1343 | * problems. The value of the canary is kept at offset 20 from the |
1172 | * %gs register, so we need to set that up before calling C functions | 1344 | * %gs register, so we need to set that up before calling C functions |
1173 | * in other files. */ | 1345 | * in other files. |
1346 | */ | ||
1174 | setup_stack_canary_segment(0); | 1347 | setup_stack_canary_segment(0); |
1175 | /* We could just call load_stack_canary_segment(), but we might as | 1348 | |
1176 | * call switch_to_new_gdt() which loads the whole table and sets up | 1349 | /* |
1177 | * the per-cpu segment descriptor register %fs as well. */ | 1350 | * We could just call load_stack_canary_segment(), but we might as well |
1351 | * call switch_to_new_gdt() which loads the whole table and sets up the | ||
1352 | * per-cpu segment descriptor register %fs as well. | ||
1353 | */ | ||
1178 | switch_to_new_gdt(0); | 1354 | switch_to_new_gdt(0); |
1179 | 1355 | ||
1180 | /* As described in head_32.S, we map the first 128M of memory. */ | 1356 | /* We actually boot with all memory mapped, but let's say 128MB. */ |
1181 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | 1357 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; |
1182 | 1358 | ||
1183 | /* The Host<->Guest Switcher lives at the top of our address space, and | 1359 | /* |
1360 | * The Host<->Guest Switcher lives at the top of our address space, and | ||
1184 | * the Host told us how big it is when we made LGUEST_INIT hypercall: | 1361 | * the Host told us how big it is when we made LGUEST_INIT hypercall: |
1185 | * it put the answer in lguest_data.reserve_mem */ | 1362 | * it put the answer in lguest_data.reserve_mem |
1363 | */ | ||
1186 | reserve_top_address(lguest_data.reserve_mem); | 1364 | reserve_top_address(lguest_data.reserve_mem); |
1187 | 1365 | ||
1188 | /* If we don't initialize the lock dependency checker now, it crashes | 1366 | /* |
1189 | * paravirt_disable_iospace. */ | 1367 | * If we don't initialize the lock dependency checker now, it crashes |
1368 | * paravirt_disable_iospace. | ||
1369 | */ | ||
1190 | lockdep_init(); | 1370 | lockdep_init(); |
1191 | 1371 | ||
1192 | /* The IDE code spends about 3 seconds probing for disks: if we reserve | 1372 | /* |
1373 | * The IDE code spends about 3 seconds probing for disks: if we reserve | ||
1193 | * all the I/O ports up front it can't get them and so doesn't probe. | 1374 | * all the I/O ports up front it can't get them and so doesn't probe. |
1194 | * Other device drivers are similar (but less severe). This cuts the | 1375 | * Other device drivers are similar (but less severe). This cuts the |
1195 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ | 1376 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. |
1377 | */ | ||
1196 | paravirt_disable_iospace(); | 1378 | paravirt_disable_iospace(); |
1197 | 1379 | ||
1198 | /* This is messy CPU setup stuff which the native boot code does before | 1380 | /* |
1199 | * start_kernel, so we have to do it, too: */ | 1381 | * This is messy CPU setup stuff which the native boot code does before |
1382 | * start_kernel, so we have to do it, too: | ||
1383 | */ | ||
1200 | cpu_detect(&new_cpu_data); | 1384 | cpu_detect(&new_cpu_data); |
1201 | /* head.S usually sets up the first capability word, so do it here. */ | 1385 | /* head.S usually sets up the first capability word, so do it here. */ |
1202 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | 1386 | new_cpu_data.x86_capability[0] = cpuid_edx(1); |
@@ -1213,22 +1397,28 @@ __init void lguest_init(void) | |||
1213 | acpi_ht = 0; | 1397 | acpi_ht = 0; |
1214 | #endif | 1398 | #endif |
1215 | 1399 | ||
1216 | /* We set the preferred console to "hvc". This is the "hypervisor | 1400 | /* |
1401 | * We set the preferred console to "hvc". This is the "hypervisor | ||
1217 | * virtual console" driver written by the PowerPC people, which we also | 1402 | * virtual console" driver written by the PowerPC people, which we also |
1218 | * adapted for lguest's use. */ | 1403 | * adapted for lguest's use. |
1404 | */ | ||
1219 | add_preferred_console("hvc", 0, NULL); | 1405 | add_preferred_console("hvc", 0, NULL); |
1220 | 1406 | ||
1221 | /* Register our very early console. */ | 1407 | /* Register our very early console. */ |
1222 | virtio_cons_early_init(early_put_chars); | 1408 | virtio_cons_early_init(early_put_chars); |
1223 | 1409 | ||
1224 | /* Last of all, we set the power management poweroff hook to point to | 1410 | /* |
1411 | * Last of all, we set the power management poweroff hook to point to | ||
1225 | * the Guest routine to power off, and the reboot hook to our restart | 1412 | * the Guest routine to power off, and the reboot hook to our restart |
1226 | * routine. */ | 1413 | * routine. |
1414 | */ | ||
1227 | pm_power_off = lguest_power_off; | 1415 | pm_power_off = lguest_power_off; |
1228 | machine_ops.restart = lguest_restart; | 1416 | machine_ops.restart = lguest_restart; |
1229 | 1417 | ||
1230 | /* Now we're set up, we call i386_start_kernel() in head32.c and proceed | 1418 | /* |
1231 | * to boot as normal. It never returns. */ | 1419 | * Now we're set up, we call i386_start_kernel() in head32.c and proceed |
1420 | * to boot as normal. It never returns. | ||
1421 | */ | ||
1232 | i386_start_kernel(); | 1422 | i386_start_kernel(); |
1233 | } | 1423 | } |
1234 | /* | 1424 | /* |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe61cd4..27eac0faee48 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
@@ -5,7 +5,8 @@ | |||
5 | #include <asm/thread_info.h> | 5 | #include <asm/thread_info.h> |
6 | #include <asm/processor-flags.h> | 6 | #include <asm/processor-flags.h> |
7 | 7 | ||
8 | /*G:020 Our story starts with the kernel booting into startup_32 in | 8 | /*G:020 |
9 | * Our story starts with the kernel booting into startup_32 in | ||
9 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by | 10 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by |
10 | * the bootloader (the Launcher in our case). | 11 | * the bootloader (the Launcher in our case). |
11 | * | 12 | * |
@@ -21,11 +22,14 @@ | |||
21 | * data without remembering to subtract __PAGE_OFFSET! | 22 | * data without remembering to subtract __PAGE_OFFSET! |
22 | * | 23 | * |
23 | * The .section line puts this code in .init.text so it will be discarded after | 24 | * The .section line puts this code in .init.text so it will be discarded after |
24 | * boot. */ | 25 | * boot. |
26 | */ | ||
25 | .section .init.text, "ax", @progbits | 27 | .section .init.text, "ax", @progbits |
26 | ENTRY(lguest_entry) | 28 | ENTRY(lguest_entry) |
27 | /* We make the "initialization" hypercall now to tell the Host about | 29 | /* |
28 | * us, and also find out where it put our page tables. */ | 30 | * We make the "initialization" hypercall now to tell the Host about |
31 | * us, and also find out where it put our page tables. | ||
32 | */ | ||
29 | movl $LHCALL_LGUEST_INIT, %eax | 33 | movl $LHCALL_LGUEST_INIT, %eax |
30 | movl $lguest_data - __PAGE_OFFSET, %ebx | 34 | movl $lguest_data - __PAGE_OFFSET, %ebx |
31 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 35 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
@@ -33,13 +37,14 @@ ENTRY(lguest_entry) | |||
33 | /* Set up the initial stack so we can run C code. */ | 37 | /* Set up the initial stack so we can run C code. */ |
34 | movl $(init_thread_union+THREAD_SIZE),%esp | 38 | movl $(init_thread_union+THREAD_SIZE),%esp |
35 | 39 | ||
36 | /* Jumps are relative, and we're running __PAGE_OFFSET too low at the | 40 | /* Jumps are relative: we're running __PAGE_OFFSET too low. */ |
37 | * moment. */ | ||
38 | jmp lguest_init+__PAGE_OFFSET | 41 | jmp lguest_init+__PAGE_OFFSET |
39 | 42 | ||
40 | /*G:055 We create a macro which puts the assembler code between lgstart_ and | 43 | /*G:055 |
41 | * lgend_ markers. These templates are put in the .text section: they can't be | 44 | * We create a macro which puts the assembler code between lgstart_ and lgend_ |
42 | * discarded after boot as we may need to patch modules, too. */ | 45 | * markers. These templates are put in the .text section: they can't be |
46 | * discarded after boot as we may need to patch modules, too. | ||
47 | */ | ||
43 | .text | 48 | .text |
44 | #define LGUEST_PATCH(name, insns...) \ | 49 | #define LGUEST_PATCH(name, insns...) \ |
45 | lgstart_##name: insns; lgend_##name:; \ | 50 | lgstart_##name: insns; lgend_##name:; \ |
@@ -48,83 +53,103 @@ ENTRY(lguest_entry) | |||
48 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) | 53 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) |
49 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) | 54 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) |
50 | 55 | ||
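For instance, the pushf template above expands to roughly this (a sketch; the elided tail of the macro also exports the two markers with .globl so lguest_patch can find them):

        lgstart_pushf: movl lguest_data+LGUEST_DATA_irq_enabled, %eax; lgend_pushf:;
        .globl lgstart_pushf; .globl lgend_pushf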
51 | /*G:033 But using those wrappers is inefficient (we'll see why that doesn't | 56 | /*G:033 |
52 | * matter for save_fl and irq_disable later). If we write our routines | 57 | * But using those wrappers is inefficient (we'll see why that doesn't matter |
53 | * carefully in assembler, we can avoid clobbering any registers and avoid | 58 | * for save_fl and irq_disable later). If we write our routines carefully in |
54 | * jumping through the wrapper functions. | 59 | * assembler, we can avoid clobbering any registers and avoid jumping through |
60 | * the wrapper functions. | ||
55 | * | 61 | * |
56 | * I skipped over our first piece of assembler, but this one is worth studying | 62 | * I skipped over our first piece of assembler, but this one is worth studying |
57 | * in a bit more detail so I'll describe it in easy stages. First, the routine | 63 | * in a bit more detail so I'll describe it in easy stages. First, the routine to |
58 | * to enable interrupts: */ | 64 | * enable interrupts: |
65 | */ | ||
59 | ENTRY(lg_irq_enable) | 66 | ENTRY(lg_irq_enable) |
60 | /* The reverse of irq_disable, this sets lguest_data.irq_enabled to | 67 | /* |
61 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ | 68 | * The reverse of irq_disable, this sets lguest_data.irq_enabled to |
69 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). | ||
70 | */ | ||
62 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled | 71 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled |
63 | /* But now we need to check if the Host wants to know: there might have | 72 | /* |
73 | * But now we need to check if the Host wants to know: there might have | ||
64 | * been interrupts waiting to be delivered, in which case it will have | 74 | * been interrupts waiting to be delivered, in which case it will have |
65 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we | 75 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we |
66 | * jump to send_interrupts, otherwise we're done. */ | 76 | * jump to send_interrupts, otherwise we're done. |
77 | */ | ||
67 | cmpl $0, lguest_data+LGUEST_DATA_irq_pending | 78 | cmpl $0, lguest_data+LGUEST_DATA_irq_pending |
68 | jnz send_interrupts | 79 | jnz send_interrupts |
69 | /* One cool thing about x86 is that you can do many things without using | 80 | /* |
81 | * One cool thing about x86 is that you can do many things without using | ||
70 | * a register. In this case, the normal path hasn't needed to save or | 82 | * a register. In this case, the normal path hasn't needed to save or |
71 | * restore any registers at all! */ | 83 | * restore any registers at all! |
84 | */ | ||
72 | ret | 85 | ret |
73 | send_interrupts: | 86 | send_interrupts: |
74 | /* OK, now we need a register: eax is used for the hypercall number, | 87 | /* |
88 | * OK, now we need a register: eax is used for the hypercall number, | ||
75 | * which is LHCALL_SEND_INTERRUPTS. | 89 | * which is LHCALL_SEND_INTERRUPTS. |
76 | * | 90 | * |
77 | * We used not to bother with this pending detection at all, which was | 91 | * We used not to bother with this pending detection at all, which was |
78 | * much simpler. Sooner or later the Host would realize it had to | 92 | * much simpler. Sooner or later the Host would realize it had to |
79 | * send us an interrupt. But that turns out to make performance 7 | 93 | * send us an interrupt. But that turns out to make performance 7 |
80 | * times worse on a simple tcp benchmark. So now we do this the hard | 94 | * times worse on a simple tcp benchmark. So now we do this the hard |
81 | * way. */ | 95 | * way. |
96 | */ | ||
82 | pushl %eax | 97 | pushl %eax |
83 | movl $LHCALL_SEND_INTERRUPTS, %eax | 98 | movl $LHCALL_SEND_INTERRUPTS, %eax |
84 | /* This is a vmcall instruction (same thing that KVM uses). Older | 99 | /* |
100 | * This is a vmcall instruction (same thing that KVM uses). Older | ||
85 | * assembler versions might not know the "vmcall" instruction, so we | 101 | * assembler versions might not know the "vmcall" instruction, so we |
86 | * create one manually here. */ | 102 | * create one manually here. |
103 | */ | ||
87 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
105 | /* Put eax back the way we found it. */ | ||
88 | popl %eax | 106 | popl %eax |
89 | ret | 107 | ret |
90 | 108 | ||
91 | /* Finally, the "popf" or "restore flags" routine. The %eax register holds the | 109 | /* |
110 | * Finally, the "popf" or "restore flags" routine. The %eax register holds the | ||
92 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're | 111 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're |
93 | * enabling interrupts again, if it's 0 we're leaving them off. */ | 112 | * enabling interrupts again, if it's 0 we're leaving them off. |
113 | */ | ||
94 | ENTRY(lg_restore_fl) | 114 | ENTRY(lg_restore_fl) |
95 | /* This is just "lguest_data.irq_enabled = flags;" */ | 115 | /* This is just "lguest_data.irq_enabled = flags;" */ |
96 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled | 116 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled |
97 | /* Now, if the %eax value has enabled interrupts and | 117 | /* |
118 | * Now, if the %eax value has enabled interrupts and | ||
98 | * lguest_data.irq_pending is set, we want to tell the Host so it can | 119 | * lguest_data.irq_pending is set, we want to tell the Host so it can |
99 | * deliver any outstanding interrupts. Fortunately, both values will | 120 | * deliver any outstanding interrupts. Fortunately, both values will |
100 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" | 121 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" |
101 | * instruction will AND them together for us. If both are set, we | 122 | * instruction will AND them together for us. If both are set, we |
102 | * jump to send_interrupts. */ | 123 | * jump to send_interrupts. |
124 | */ | ||
103 | testl lguest_data+LGUEST_DATA_irq_pending, %eax | 125 | testl lguest_data+LGUEST_DATA_irq_pending, %eax |
104 | jnz send_interrupts | 126 | jnz send_interrupts |
105 | /* Again, the normal path has used no extra registers. Clever, huh? */ | 127 | /* Again, the normal path has used no extra registers. Clever, huh? */ |
106 | ret | 128 | ret |
129 | /*:*/ | ||
107 | 130 | ||
108 | /* These demark the EIP range where host should never deliver interrupts. */ | 131 | /* These demark the EIP range where host should never deliver interrupts. */ |
109 | .global lguest_noirq_start | 132 | .global lguest_noirq_start |
110 | .global lguest_noirq_end | 133 | .global lguest_noirq_end |
111 | 134 | ||
112 | /*M:004 When the Host reflects a trap or injects an interrupt into the Guest, | 135 | /*M:004 |
113 | * it sets the eflags interrupt bit on the stack based on | 136 | * When the Host reflects a trap or injects an interrupt into the Guest, it |
114 | * lguest_data.irq_enabled, so the Guest iret logic does the right thing when | 137 | * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, |
115 | * restoring it. However, when the Host sets the Guest up for direct traps, | 138 | * so the Guest iret logic does the right thing when restoring it. However, |
116 | * such as system calls, the processor is the one to push eflags onto the | 139 | * when the Host sets the Guest up for direct traps, such as system calls, the |
117 | * stack, and the interrupt bit will be 1 (in reality, interrupts are always | 140 | * processor is the one to push eflags onto the stack, and the interrupt bit |
118 | * enabled in the Guest). | 141 | * will be 1 (in reality, interrupts are always enabled in the Guest). |
119 | * | 142 | * |
120 | * This turns out to be harmless: the only trap which should happen under Linux | 143 | * This turns out to be harmless: the only trap which should happen under Linux |
121 | * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc | 144 | * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc |
122 | * regions), which has to be reflected through the Host anyway. If another | 145 | * regions), which has to be reflected through the Host anyway. If another |
123 | * trap *does* go off when interrupts are disabled, the Guest will panic, and | 146 | * trap *does* go off when interrupts are disabled, the Guest will panic, and |
124 | * we'll never get to this iret! :*/ | 147 | * we'll never get to this iret! |
148 | :*/ | ||
125 | 149 | ||
126 | /*G:045 There is one final paravirt_op that the Guest implements, and glancing | 150 | /*G:045 |
127 | * at it you can see why I left it to last. It's *cool*! It's in *assembler*! | 151 | * There is one final paravirt_op that the Guest implements, and glancing at it |
152 | * you can see why I left it to last. It's *cool*! It's in *assembler*! | ||
128 | * | 153 | * |
129 | * The "iret" instruction is used to return from an interrupt or trap. The | 154 | * The "iret" instruction is used to return from an interrupt or trap. The |
130 | * stack looks like this: | 155 | * stack looks like this: |
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl) | |||
148 | * return to userspace or wherever. Our solution to this is to surround the | 173 | * return to userspace or wherever. Our solution to this is to surround the |
149 | * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the | 174 | * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the |
150 | * Host that it is *never* to interrupt us there, even if interrupts seem to be | 175 | * Host that it is *never* to interrupt us there, even if interrupts seem to be |
151 | * enabled. */ | 176 | * enabled. |
177 | */ | ||
152 | ENTRY(lguest_iret) | 178 | ENTRY(lguest_iret) |
153 | pushl %eax | 179 | pushl %eax |
154 | movl 12(%esp), %eax | 180 | movl 12(%esp), %eax |
155 | lguest_noirq_start: | 181 | lguest_noirq_start: |
156 | /* Note the %ss: segment prefix here. Normal data accesses use the | 182 | /* |
183 | * Note the %ss: segment prefix here. Normal data accesses use the | ||
157 | * "ds" segment, but that will have already been restored for whatever | 184 | * "ds" segment, but that will have already been restored for whatever |
158 | * we're returning to (such as userspace): we can't trust it. The %ss: | 185 | * we're returning to (such as userspace): we can't trust it. The %ss: |
159 | * prefix makes sure we use the stack segment, which is still valid. */ | 186 | * prefix makes sure we use the stack segment, which is still valid. |
187 | */ | ||
160 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled | 188 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled |
161 | popl %eax | 189 | popl %eax |
162 | iret | 190 | iret |
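
The guest-side fast paths above compress to a few lines of C; the reason they are hand-written assembler is that, unlike compiled C, the common path clobbers no registers at all. A minimal sketch of lg_irq_enable's logic, assuming a lguest_data layout with irq_enabled/irq_pending words and a hypercall() wrapper (both stand-in names here, not the exact kernel interfaces):

	/* Sketch only: what the assembler above does, rendered in C. */
	void lg_irq_enable_sketch(void)
	{
		lguest_data.irq_enabled = X86_EFLAGS_IF; /* enable interrupts */
		if (lguest_data.irq_pending)             /* Host queued work? */
			hypercall(LHCALL_SEND_INTERRUPTS);
	}
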
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f9d35632666b..07c31899c9c2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile | |||
@@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o | |||
10 | lib-y += memcpy_$(BITS).o | 10 | lib-y += memcpy_$(BITS).o |
11 | 11 | ||
12 | ifeq ($(CONFIG_X86_32),y) | 12 | ifeq ($(CONFIG_X86_32),y) |
13 | obj-y += atomic64_32.o | ||
13 | lib-y += checksum_32.o | 14 | lib-y += checksum_32.o |
14 | lib-y += strstr_32.o | 15 | lib-y += strstr_32.o |
15 | lib-y += semaphore_32.o string_32.o | 16 | lib-y += semaphore_32.o string_32.o |
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c new file mode 100644 index 000000000000..824fa0be55a3 --- /dev/null +++ b/arch/x86/lib/atomic64_32.c | |||
@@ -0,0 +1,230 @@ | |||
1 | #include <linux/compiler.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/types.h> | ||
4 | |||
5 | #include <asm/processor.h> | ||
6 | #include <asm/cmpxchg.h> | ||
7 | #include <asm/atomic.h> | ||
8 | |||
9 | static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new) | ||
10 | { | ||
11 | u32 low = new; | ||
12 | u32 high = new >> 32; | ||
13 | |||
14 | asm volatile( | ||
15 | LOCK_PREFIX "cmpxchg8b %1\n" | ||
16 | : "+A" (old), "+m" (*ptr) | ||
17 | : "b" (low), "c" (high) | ||
18 | ); | ||
19 | return old; | ||
20 | } | ||
21 | |||
22 | u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val) | ||
23 | { | ||
24 | return cmpxchg8b(&ptr->counter, old_val, new_val); | ||
25 | } | ||
26 | EXPORT_SYMBOL(atomic64_cmpxchg); | ||
27 | |||
28 | /** | ||
29 | * atomic64_xchg - xchg atomic64 variable | ||
30 | * @ptr: pointer to type atomic64_t | ||
31 | * @new_val: value to assign | ||
32 | * | ||
33 | * Atomically xchgs the value of @ptr to @new_val and returns | ||
34 | * the old value. | ||
35 | */ | ||
36 | u64 atomic64_xchg(atomic64_t *ptr, u64 new_val) | ||
37 | { | ||
38 | /* | ||
39 | * Try first with a (possibly incorrect) assumption about | ||
40 | * what we have there. We'll do two loops most likely, | ||
41 | * but we'll get an ownership MESI transaction straight away | ||
42 | * instead of a read transaction followed by a | ||
43 | * flush-for-ownership transaction: | ||
44 | */ | ||
45 | u64 old_val, real_val = 0; | ||
46 | |||
47 | do { | ||
48 | old_val = real_val; | ||
49 | |||
50 | real_val = atomic64_cmpxchg(ptr, old_val, new_val); | ||
51 | |||
52 | } while (real_val != old_val); | ||
53 | |||
54 | return old_val; | ||
55 | } | ||
56 | EXPORT_SYMBOL(atomic64_xchg); | ||
57 | |||
58 | /** | ||
59 | * atomic64_set - set atomic64 variable | ||
60 | * @ptr: pointer to type atomic64_t | ||
61 | * @new_val: value to assign | ||
62 | * | ||
63 | * Atomically sets the value of @ptr to @new_val. | ||
64 | */ | ||
65 | void atomic64_set(atomic64_t *ptr, u64 new_val) | ||
66 | { | ||
67 | atomic64_xchg(ptr, new_val); | ||
68 | } | ||
69 | EXPORT_SYMBOL(atomic64_set); | ||
70 | |||
71 | EXPORT_SYMBOL(atomic64_read); | ||
72 | /** | ||
73 | * atomic64_add_return - add and return | ||
74 | * @delta: integer value to add | ||
75 | * @ptr: pointer to type atomic64_t | ||
76 | * | ||
77 | * Atomically adds @delta to @ptr and returns @delta + *@ptr | ||
78 | */ | ||
79 | noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr) | ||
80 | { | ||
81 | /* | ||
82 | * Try first with a (possibly incorrect) assumption about | ||
83 | * what we have there. We'll do two loops most likely, | ||
84 | * but we'll get an ownership MESI transaction straight away | ||
85 | * instead of a read transaction followed by a | ||
86 | * flush-for-ownership transaction: | ||
87 | */ | ||
88 | u64 old_val, new_val, real_val = 0; | ||
89 | |||
90 | do { | ||
91 | old_val = real_val; | ||
92 | new_val = old_val + delta; | ||
93 | |||
94 | real_val = atomic64_cmpxchg(ptr, old_val, new_val); | ||
95 | |||
96 | } while (real_val != old_val); | ||
97 | |||
98 | return new_val; | ||
99 | } | ||
100 | EXPORT_SYMBOL(atomic64_add_return); | ||
101 | |||
102 | u64 atomic64_sub_return(u64 delta, atomic64_t *ptr) | ||
103 | { | ||
104 | return atomic64_add_return(-delta, ptr); | ||
105 | } | ||
106 | EXPORT_SYMBOL(atomic64_sub_return); | ||
107 | |||
108 | u64 atomic64_inc_return(atomic64_t *ptr) | ||
109 | { | ||
110 | return atomic64_add_return(1, ptr); | ||
111 | } | ||
112 | EXPORT_SYMBOL(atomic64_inc_return); | ||
113 | |||
114 | u64 atomic64_dec_return(atomic64_t *ptr) | ||
115 | { | ||
116 | return atomic64_sub_return(1, ptr); | ||
117 | } | ||
118 | EXPORT_SYMBOL(atomic64_dec_return); | ||
119 | |||
120 | /** | ||
121 | * atomic64_add - add integer to atomic64 variable | ||
122 | * @delta: integer value to add | ||
123 | * @ptr: pointer to type atomic64_t | ||
124 | * | ||
125 | * Atomically adds @delta to @ptr. | ||
126 | */ | ||
127 | void atomic64_add(u64 delta, atomic64_t *ptr) | ||
128 | { | ||
129 | atomic64_add_return(delta, ptr); | ||
130 | } | ||
131 | EXPORT_SYMBOL(atomic64_add); | ||
132 | |||
133 | /** | ||
134 | * atomic64_sub - subtract the atomic64 variable | ||
135 | * @delta: integer value to subtract | ||
136 | * @ptr: pointer to type atomic64_t | ||
137 | * | ||
138 | * Atomically subtracts @delta from @ptr. | ||
139 | */ | ||
140 | void atomic64_sub(u64 delta, atomic64_t *ptr) | ||
141 | { | ||
142 | atomic64_add(-delta, ptr); | ||
143 | } | ||
144 | EXPORT_SYMBOL(atomic64_sub); | ||
145 | |||
146 | /** | ||
147 | * atomic64_sub_and_test - subtract value from variable and test result | ||
148 | * @delta: integer value to subtract | ||
149 | * @ptr: pointer to type atomic64_t | ||
150 | * | ||
151 | * Atomically subtracts @delta from @ptr and returns | ||
152 | * true if the result is zero, or false for all | ||
153 | * other cases. | ||
154 | */ | ||
155 | int atomic64_sub_and_test(u64 delta, atomic64_t *ptr) | ||
156 | { | ||
157 | u64 new_val = atomic64_sub_return(delta, ptr); | ||
158 | |||
159 | return new_val == 0; | ||
160 | } | ||
161 | EXPORT_SYMBOL(atomic64_sub_and_test); | ||
162 | |||
163 | /** | ||
164 | * atomic64_inc - increment atomic64 variable | ||
165 | * @ptr: pointer to type atomic64_t | ||
166 | * | ||
167 | * Atomically increments @ptr by 1. | ||
168 | */ | ||
169 | void atomic64_inc(atomic64_t *ptr) | ||
170 | { | ||
171 | atomic64_add(1, ptr); | ||
172 | } | ||
173 | EXPORT_SYMBOL(atomic64_inc); | ||
174 | |||
175 | /** | ||
176 | * atomic64_dec - decrement atomic64 variable | ||
177 | * @ptr: pointer to type atomic64_t | ||
178 | * | ||
179 | * Atomically decrements @ptr by 1. | ||
180 | */ | ||
181 | void atomic64_dec(atomic64_t *ptr) | ||
182 | { | ||
183 | atomic64_sub(1, ptr); | ||
184 | } | ||
185 | EXPORT_SYMBOL(atomic64_dec); | ||
186 | |||
187 | /** | ||
188 | * atomic64_dec_and_test - decrement and test | ||
189 | * @ptr: pointer to type atomic64_t | ||
190 | * | ||
191 | * Atomically decrements @ptr by 1 and | ||
192 | * returns true if the result is 0, or false for all other | ||
193 | * cases. | ||
194 | */ | ||
195 | int atomic64_dec_and_test(atomic64_t *ptr) | ||
196 | { | ||
197 | return atomic64_sub_and_test(1, ptr); | ||
198 | } | ||
199 | EXPORT_SYMBOL(atomic64_dec_and_test); | ||
200 | |||
201 | /** | ||
202 | * atomic64_inc_and_test - increment and test | ||
203 | * @ptr: pointer to type atomic64_t | ||
204 | * | ||
205 | * Atomically increments @ptr by 1 | ||
206 | * and returns true if the result is zero, or false for all | ||
207 | * other cases. | ||
208 | */ | ||
209 | int atomic64_inc_and_test(atomic64_t *ptr) | ||
210 | { | ||
211 | return atomic64_sub_and_test(-1, ptr); | ||
212 | } | ||
213 | EXPORT_SYMBOL(atomic64_inc_and_test); | ||
214 | |||
215 | /** | ||
216 | * atomic64_add_negative - add and test if negative | ||
217 | * @delta: integer value to add | ||
218 | * @ptr: pointer to type atomic64_t | ||
219 | * | ||
220 | * Atomically adds @delta to @ptr and returns true | ||
221 | * if the result is negative, or false when | ||
222 | * the result is greater than or equal to zero. | ||
223 | */ | ||
224 | int atomic64_add_negative(u64 delta, atomic64_t *ptr) | ||
225 | { | ||
226 | s64 new_val = atomic64_add_return(delta, ptr); | ||
227 | |||
228 | return new_val < 0; | ||
229 | } | ||
230 | EXPORT_SYMBOL(atomic64_add_negative); | ||
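
The "possibly incorrect assumption" trick used by atomic64_xchg() and atomic64_add_return() above is worth seeing in isolation: seeding the loop with a guessed old value means the very first LOCK CMPXCHG8B requests the cache line for ownership, instead of a plain read (shared state) followed by a flush-for-ownership on the retry. A standalone sketch of the same loop, with GCC's __sync_val_compare_and_swap builtin standing in for the inline asm:

	#include <stdint.h>

	/* Sketch: the optimistic cmpxchg loop, user-space rendition. */
	static uint64_t xchg64_sketch(uint64_t *ptr, uint64_t new_val)
	{
		uint64_t old_val, real_val = 0;

		do {
			old_val  = real_val;	/* guess from the last attempt */
			real_val = __sync_val_compare_and_swap(ptr, old_val,
							       new_val);
		} while (real_val != old_val);	/* loop until the guess held */

		return old_val;
	}
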
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 9a10a78bb4a4..ebeafcce04a9 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S | |||
@@ -5,15 +5,14 @@ | |||
5 | * Zero a page. | 5 | * Zero a page. |
6 | * rdi page | 6 | * rdi page |
7 | */ | 7 | */ |
8 | ALIGN | 8 | ENTRY(clear_page_c) |
9 | clear_page_c: | ||
10 | CFI_STARTPROC | 9 | CFI_STARTPROC |
11 | movl $4096/8,%ecx | 10 | movl $4096/8,%ecx |
12 | xorl %eax,%eax | 11 | xorl %eax,%eax |
13 | rep stosq | 12 | rep stosq |
14 | ret | 13 | ret |
15 | CFI_ENDPROC | 14 | CFI_ENDPROC |
16 | ENDPROC(clear_page) | 15 | ENDPROC(clear_page_c) |
17 | 16 | ||
18 | ENTRY(clear_page) | 17 | ENTRY(clear_page) |
19 | CFI_STARTPROC | 18 | CFI_STARTPROC |
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index f118c110af32..6ba0f7bb85ea 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S | |||
@@ -75,6 +75,7 @@ ENTRY(copy_to_user) | |||
75 | jae bad_to_user | 75 | jae bad_to_user |
76 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string | 76 | ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string |
77 | CFI_ENDPROC | 77 | CFI_ENDPROC |
78 | ENDPROC(copy_to_user) | ||
78 | 79 | ||
79 | /* Standard copy_from_user with segment limit checking */ | 80 | /* Standard copy_from_user with segment limit checking */ |
80 | ENTRY(copy_from_user) | 81 | ENTRY(copy_from_user) |
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f4568605d7d5..ff485d361182 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c | |||
@@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops) | |||
55 | 55 | ||
56 | preempt_disable(); | 56 | preempt_disable(); |
57 | cpu = smp_processor_id(); | 57 | cpu = smp_processor_id(); |
58 | rdtsc_barrier(); | ||
58 | rdtscl(bclock); | 59 | rdtscl(bclock); |
59 | for (;;) { | 60 | for (;;) { |
61 | rdtsc_barrier(); | ||
60 | rdtscl(now); | 62 | rdtscl(now); |
61 | if ((now - bclock) >= loops) | 63 | if ((now - bclock) >= loops) |
62 | break; | 64 | break; |
@@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops) | |||
78 | if (unlikely(cpu != smp_processor_id())) { | 80 | if (unlikely(cpu != smp_processor_id())) { |
79 | loops -= (now - bclock); | 81 | loops -= (now - bclock); |
80 | cpu = smp_processor_id(); | 82 | cpu = smp_processor_id(); |
83 | rdtsc_barrier(); | ||
81 | rdtscl(bclock); | 84 | rdtscl(bclock); |
82 | } | 85 | } |
83 | } | 86 | } |
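
The rdtsc_barrier() insertions matter because RDTSC is not a serializing instruction: an out-of-order CPU may issue it earlier than program order, which skews short delays. A user-space sketch of an ordered TSC read; the kernel's rdtsc_barrier() patches in MFENCE or LFENCE depending on the CPU, LFENCE is used here purely for illustration:

	#include <stdint.h>
	#include <x86intrin.h>

	/* Sketch: fence before reading the TSC so the read can't float up. */
	static inline uint64_t rdtsc_ordered_sketch(void)
	{
		_mm_lfence();
		return __rdtsc();
	}
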
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 1440b9c0547e..caa24aca8115 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c | |||
@@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) | |||
89 | rv.msrs = msrs; | 89 | rv.msrs = msrs; |
90 | rv.msr_no = msr_no; | 90 | rv.msr_no = msr_no; |
91 | 91 | ||
92 | preempt_disable(); | 92 | this_cpu = get_cpu(); |
93 | /* | 93 | |
94 | * FIXME: handle the CPU we're executing on separately for now until | 94 | if (cpumask_test_cpu(this_cpu, mask)) |
95 | * smp_call_function_many has been fixed to not skip it. | 95 | __rdmsr_on_cpu(&rv); |
96 | */ | ||
97 | this_cpu = raw_smp_processor_id(); | ||
98 | smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1); | ||
99 | 96 | ||
100 | smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); | 97 | smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); |
101 | preempt_enable(); | 98 | put_cpu(); |
102 | } | 99 | } |
103 | EXPORT_SYMBOL(rdmsr_on_cpus); | 100 | EXPORT_SYMBOL(rdmsr_on_cpus); |
104 | 101 | ||
@@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) | |||
121 | rv.msrs = msrs; | 118 | rv.msrs = msrs; |
122 | rv.msr_no = msr_no; | 119 | rv.msr_no = msr_no; |
123 | 120 | ||
124 | preempt_disable(); | 121 | this_cpu = get_cpu(); |
125 | /* | 122 | |
126 | * FIXME: handle the CPU we're executing on separately for now until | 123 | if (cpumask_test_cpu(this_cpu, mask)) |
127 | * smp_call_function_many has been fixed to not skip it. | 124 | __wrmsr_on_cpu(&rv); |
128 | */ | ||
129 | this_cpu = raw_smp_processor_id(); | ||
130 | smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1); | ||
131 | 125 | ||
132 | smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); | 126 | smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); |
133 | preempt_enable(); | 127 | put_cpu(); |
134 | } | 128 | } |
135 | EXPORT_SYMBOL(wrmsr_on_cpus); | 129 | EXPORT_SYMBOL(wrmsr_on_cpus); |
136 | 130 | ||
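
Both hunks lean on the same convention: smp_call_function_many() never runs the function on the calling CPU, so a caller whose own CPU is in the mask must invoke the function itself, with get_cpu() pinning it so "this CPU" cannot change underneath. The shape of the pattern, as a sketch (kernel context assumed; run_on_mask_sketch is not a real API):

	/* Sketch: run func on every CPU in mask, covering ourselves locally. */
	static void run_on_mask_sketch(const struct cpumask *mask,
				       void (*func)(void *), void *info)
	{
		int cpu = get_cpu();		/* pins us: preemption off */

		if (cpumask_test_cpu(cpu, mask))
			func(info);		/* our own CPU, call directly */
		smp_call_function_many(mask, func, info, 1); /* remote CPUs */
		put_cpu();
	}
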
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7c8ca91bb9ec..1f118d462acc 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c | |||
@@ -751,7 +751,7 @@ survive: | |||
751 | 751 | ||
752 | if (retval == -ENOMEM && is_global_init(current)) { | 752 | if (retval == -ENOMEM && is_global_init(current)) { |
753 | up_read(¤t->mm->mmap_sem); | 753 | up_read(¤t->mm->mmap_sem); |
754 | congestion_wait(WRITE, HZ/50); | 754 | congestion_wait(BLK_RW_ASYNC, HZ/50); |
755 | goto survive; | 755 | goto survive; |
756 | } | 756 | } |
757 | 757 | ||
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index ec13cb5f17ed..b7c2849ffb66 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c | |||
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(__strnlen_user); | |||
127 | 127 | ||
128 | long strnlen_user(const char __user *s, long n) | 128 | long strnlen_user(const char __user *s, long n) |
129 | { | 129 | { |
130 | if (!access_ok(VERIFY_READ, s, n)) | 130 | if (!access_ok(VERIFY_READ, s, 1)) |
131 | return 0; | 131 | return 0; |
132 | return __strnlen_user(s, n); | 132 | return __strnlen_user(s, n); |
133 | } | 133 | } |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index baa0e86adfbc..bfae139182ff 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address) | |||
426 | } | 426 | } |
427 | 427 | ||
428 | static const char errata93_warning[] = | 428 | static const char errata93_warning[] = |
429 | KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" | 429 | KERN_ERR |
430 | KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" | 430 | "******* Your BIOS seems to not contain a fix for K8 errata #93\n" |
431 | KERN_ERR "******* Please consider a BIOS update.\n" | 431 | "******* Working around it, but it may cause SEGVs or burn power.\n" |
432 | KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; | 432 | "******* Please consider a BIOS update.\n" |
433 | "******* Disabling USB legacy in the BIOS may also help.\n"; | ||
433 | 434 | ||
434 | /* | 435 | /* |
435 | * No vm86 mode in 64-bit mode: | 436 | * No vm86 mode in 64-bit mode: |
@@ -696,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, | |||
696 | if (!printk_ratelimit()) | 697 | if (!printk_ratelimit()) |
697 | return; | 698 | return; |
698 | 699 | ||
699 | printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", | 700 | printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", |
700 | task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, | 701 | task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, |
701 | tsk->comm, task_pid_nr(tsk), address, | 702 | tsk->comm, task_pid_nr(tsk), address, |
702 | (void *)regs->ip, (void *)regs->sp, error_code); | 703 | (void *)regs->ip, (void *)regs->sp, error_code); |
@@ -952,8 +953,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
952 | tsk = current; | 953 | tsk = current; |
953 | mm = tsk->mm; | 954 | mm = tsk->mm; |
954 | 955 | ||
955 | prefetchw(&mm->mmap_sem); | ||
956 | |||
957 | /* Get the faulting address: */ | 956 | /* Get the faulting address: */ |
958 | address = read_cr2(); | 957 | address = read_cr2(); |
959 | 958 | ||
@@ -963,6 +962,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
963 | */ | 962 | */ |
964 | if (kmemcheck_active(regs)) | 963 | if (kmemcheck_active(regs)) |
965 | kmemcheck_hide(regs); | 964 | kmemcheck_hide(regs); |
965 | prefetchw(&mm->mmap_sem); | ||
966 | 966 | ||
967 | if (unlikely(kmmio_fault(regs, address))) | 967 | if (unlikely(kmmio_fault(regs, address))) |
968 | return; | 968 | return; |
@@ -1114,7 +1114,7 @@ good_area: | |||
1114 | * make sure we exit gracefully rather than endlessly redo | 1114 | * make sure we exit gracefully rather than endlessly redo |
1115 | * the fault: | 1115 | * the fault: |
1116 | */ | 1116 | */ |
1117 | fault = handle_mm_fault(mm, vma, address, write); | 1117 | fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); |
1118 | 1118 | ||
1119 | if (unlikely(fault & VM_FAULT_ERROR)) { | 1119 | if (unlikely(fault & VM_FAULT_ERROR)) { |
1120 | mm_fault_error(regs, error_code, address, fault); | 1120 | mm_fault_error(regs, error_code, address, fault); |
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 6340cef6798a..71da1bca13cb 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c | |||
@@ -14,7 +14,7 @@ | |||
14 | static inline pte_t gup_get_pte(pte_t *ptep) | 14 | static inline pte_t gup_get_pte(pte_t *ptep) |
15 | { | 15 | { |
16 | #ifndef CONFIG_X86_PAE | 16 | #ifndef CONFIG_X86_PAE |
17 | return *ptep; | 17 | return ACCESS_ONCE(*ptep); |
18 | #else | 18 | #else |
19 | /* | 19 | /* |
20 | * With get_user_pages_fast, we walk down the pagetables without taking | 20 | * With get_user_pages_fast, we walk down the pagetables without taking |
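
The ACCESS_ONCE() change is aimed at the compiler, not the hardware: without it, GCC may legally re-load *ptep while the walker is still examining the value, and a concurrent unmap could change the pte between those loads. ACCESS_ONCE() forces a single load through a volatile view of the object; its classic definition is roughly:

	/* Roughly the kernel's definition (spelled READ_ONCE in later kernels): */
	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
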
@@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, | |||
219 | return 1; | 219 | return 1; |
220 | } | 220 | } |
221 | 221 | ||
222 | /* | ||
223 | * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall | ||
224 | * back to the regular GUP. | ||
225 | */ | ||
226 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
227 | struct page **pages) | ||
228 | { | ||
229 | struct mm_struct *mm = current->mm; | ||
230 | unsigned long addr, len, end; | ||
231 | unsigned long next; | ||
232 | unsigned long flags; | ||
233 | pgd_t *pgdp; | ||
234 | int nr = 0; | ||
235 | |||
236 | start &= PAGE_MASK; | ||
237 | addr = start; | ||
238 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
239 | end = start + len; | ||
240 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | ||
241 | (void __user *)start, len))) | ||
242 | return 0; | ||
243 | |||
244 | /* | ||
245 | * XXX: batch / limit 'nr', to avoid large irq off latency | ||
246 | * needs some instrumenting to determine the common sizes used by | ||
247 | * important workloads (eg. DB2), and whether limiting the batch size | ||
248 | * will decrease performance. | ||
249 | * | ||
250 | * It seems like we're in the clear for the moment. Direct-IO is | ||
251 | * the main guy that batches up lots of get_user_pages, and even | ||
252 | * they are limited to 64-at-a-time which is not so many. | ||
253 | */ | ||
254 | /* | ||
255 | * This doesn't prevent pagetable teardown, but does prevent | ||
256 | * the pagetables and pages from being freed on x86. | ||
257 | * | ||
258 | * So long as we atomically load page table pointers versus teardown | ||
259 | * (which we do on x86, with the above PAE exception), we can follow the | ||
260 | * address down to the page and take a ref on it. | ||
261 | */ | ||
262 | local_irq_save(flags); | ||
263 | pgdp = pgd_offset(mm, addr); | ||
264 | do { | ||
265 | pgd_t pgd = *pgdp; | ||
266 | |||
267 | next = pgd_addr_end(addr, end); | ||
268 | if (pgd_none(pgd)) | ||
269 | break; | ||
270 | if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) | ||
271 | break; | ||
272 | } while (pgdp++, addr = next, addr != end); | ||
273 | local_irq_restore(flags); | ||
274 | |||
275 | return nr; | ||
276 | } | ||
277 | |||
222 | /** | 278 | /** |
223 | * get_user_pages_fast() - pin user pages in memory | 279 | * get_user_pages_fast() - pin user pages in memory |
224 | * @start: starting user address | 280 | * @start: starting user address |
@@ -247,11 +303,16 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
247 | start &= PAGE_MASK; | 303 | start &= PAGE_MASK; |
248 | addr = start; | 304 | addr = start; |
249 | len = (unsigned long) nr_pages << PAGE_SHIFT; | 305 | len = (unsigned long) nr_pages << PAGE_SHIFT; |
306 | |||
250 | end = start + len; | 307 | end = start + len; |
251 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | 308 | if (end < start) |
252 | (void __user *)start, len))) | ||
253 | goto slow_irqon; | 309 | goto slow_irqon; |
254 | 310 | ||
311 | #ifdef CONFIG_X86_64 | ||
312 | if (end >> __VIRTUAL_MASK_SHIFT) | ||
313 | goto slow_irqon; | ||
314 | #endif | ||
315 | |||
255 | /* | 316 | /* |
256 | * XXX: batch / limit 'nr', to avoid large irq off latency | 317 | * XXX: batch / limit 'nr', to avoid large irq off latency |
257 | * needs some instrumenting to determine the common sizes used by | 318 | * needs some instrumenting to determine the common sizes used by |
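
The replaced access_ok() call was never a sound range check here: it tests against the thread's addr_limit, which set_fs(KERNEL_DS) can lift, and an untrusted nr_pages can wrap 'end' around the address space anyway. The new code checks for arithmetic wrap and, on 64-bit, against the fixed virtual-address width. A self-contained sketch of the wrap-safe test (shift and limit values are illustrative):

	#include <stdint.h>
	#include <stdbool.h>

	#define PAGE_SHIFT_SKETCH 12

	/* Sketch: reject ranges whose end wraps past the top of the space. */
	static bool range_ok_sketch(uintptr_t start, unsigned long nr_pages,
				    uintptr_t limit)
	{
		uintptr_t len = (uintptr_t)nr_pages << PAGE_SHIFT_SKETCH;
		uintptr_t end = start + len;

		if (end < start)	/* wrapped: take the slow path */
			return false;
		return end <= limit;	/* stand-in for __VIRTUAL_MASK_SHIFT */
	}
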
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 58f621e81919..2112ed55e7ea 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap); | |||
103 | EXPORT_SYMBOL(kunmap); | 103 | EXPORT_SYMBOL(kunmap); |
104 | EXPORT_SYMBOL(kmap_atomic); | 104 | EXPORT_SYMBOL(kmap_atomic); |
105 | EXPORT_SYMBOL(kunmap_atomic); | 105 | EXPORT_SYMBOL(kunmap_atomic); |
106 | EXPORT_SYMBOL(kmap_atomic_prot); | ||
106 | 107 | ||
107 | void __init set_highmem_pages_init(void) | 108 | void __init set_highmem_pages_init(void) |
108 | { | 109 | { |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f53b57e4086f..0607119cef94 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <asm/system.h> | 12 | #include <asm/system.h> |
13 | #include <asm/tlbflush.h> | 13 | #include <asm/tlbflush.h> |
14 | #include <asm/tlb.h> | 14 | #include <asm/tlb.h> |
15 | #include <asm/proto.h> | ||
15 | 16 | ||
16 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 17 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
17 | 18 | ||
@@ -177,20 +178,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, | |||
177 | return nr_range; | 178 | return nr_range; |
178 | } | 179 | } |
179 | 180 | ||
180 | #ifdef CONFIG_X86_64 | ||
181 | static void __init init_gbpages(void) | ||
182 | { | ||
183 | if (direct_gbpages && cpu_has_gbpages) | ||
184 | printk(KERN_INFO "Using GB pages for direct mapping\n"); | ||
185 | else | ||
186 | direct_gbpages = 0; | ||
187 | } | ||
188 | #else | ||
189 | static inline void init_gbpages(void) | ||
190 | { | ||
191 | } | ||
192 | #endif | ||
193 | |||
194 | /* | 181 | /* |
195 | * Setup the direct mapping of the physical memory at PAGE_OFFSET. | 182 | * Setup the direct mapping of the physical memory at PAGE_OFFSET. |
196 | * This runs before bootmem is initialized and gets pages directly from | 183 | * This runs before bootmem is initialized and gets pages directly from |
@@ -210,9 +197,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
210 | 197 | ||
211 | printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); | 198 | printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); |
212 | 199 | ||
213 | if (!after_bootmem) | ||
214 | init_gbpages(); | ||
215 | |||
216 | #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) | 200 | #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) |
217 | /* | 201 | /* |
218 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | 202 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9c543290a813..ea56b8cbb6a6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -527,7 +527,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
527 | return phys_pud_init(pud, addr, end, page_size_mask); | 527 | return phys_pud_init(pud, addr, end, page_size_mask); |
528 | } | 528 | } |
529 | 529 | ||
530 | unsigned long __init | 530 | unsigned long __meminit |
531 | kernel_physical_mapping_init(unsigned long start, | 531 | kernel_physical_mapping_init(unsigned long start, |
532 | unsigned long end, | 532 | unsigned long end, |
533 | unsigned long page_size_mask) | 533 | unsigned long page_size_mask) |
@@ -598,6 +598,15 @@ void __init paging_init(void) | |||
598 | 598 | ||
599 | sparse_memory_present_with_active_regions(MAX_NUMNODES); | 599 | sparse_memory_present_with_active_regions(MAX_NUMNODES); |
600 | sparse_init(); | 600 | sparse_init(); |
601 | |||
602 | /* | ||
603 | * clear the default setting for node 0; | ||
604 | * note: don't use nodes_clear here; that really clears the state | ||
605 | * when NUMA support is not compiled in, and the later | ||
606 | * node_set_state will not set it back. | ||
607 | */ | ||
608 | node_clear_state(0, N_NORMAL_MEMORY); | ||
609 | |||
601 | free_area_init_nodes(max_zone_pfns); | 610 | free_area_init_nodes(max_zone_pfns); |
602 | } | 611 | } |
603 | 612 | ||
@@ -787,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | |||
787 | return ret; | 796 | return ret; |
788 | 797 | ||
789 | #else | 798 | #else |
790 | reserve_bootmem(phys, len, BOOTMEM_DEFAULT); | 799 | reserve_bootmem(phys, len, flags); |
791 | #endif | 800 | #endif |
792 | 801 | ||
793 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | 802 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cfe9ced8a4c..7e600c1962db 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
14 | #include <linux/pfn.h> | ||
14 | 15 | ||
15 | #include <asm/e820.h> | 16 | #include <asm/e820.h> |
16 | #include <asm/processor.h> | 17 | #include <asm/processor.h> |
@@ -590,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) | |||
590 | unsigned int level; | 591 | unsigned int level; |
591 | pte_t *kpte, old_pte; | 592 | pte_t *kpte, old_pte; |
592 | 593 | ||
593 | if (cpa->flags & CPA_PAGES_ARRAY) | 594 | if (cpa->flags & CPA_PAGES_ARRAY) { |
594 | address = (unsigned long)page_address(cpa->pages[cpa->curpage]); | 595 | struct page *page = cpa->pages[cpa->curpage]; |
595 | else if (cpa->flags & CPA_ARRAY) | 596 | if (unlikely(PageHighMem(page))) |
597 | return 0; | ||
598 | address = (unsigned long)page_address(page); | ||
599 | } else if (cpa->flags & CPA_ARRAY) | ||
596 | address = cpa->vaddr[cpa->curpage]; | 600 | address = cpa->vaddr[cpa->curpage]; |
597 | else | 601 | else |
598 | address = *cpa->vaddr; | 602 | address = *cpa->vaddr; |
@@ -681,8 +685,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); | |||
681 | static int cpa_process_alias(struct cpa_data *cpa) | 685 | static int cpa_process_alias(struct cpa_data *cpa) |
682 | { | 686 | { |
683 | struct cpa_data alias_cpa; | 687 | struct cpa_data alias_cpa; |
684 | int ret = 0; | 688 | unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); |
685 | unsigned long temp_cpa_vaddr, vaddr; | 689 | unsigned long vaddr, remapped; |
690 | int ret; | ||
686 | 691 | ||
687 | if (cpa->pfn >= max_pfn_mapped) | 692 | if (cpa->pfn >= max_pfn_mapped) |
688 | return 0; | 693 | return 0; |
@@ -695,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
695 | * No need to redo, when the primary call touched the direct | 700 | * No need to redo, when the primary call touched the direct |
696 | * mapping already: | 701 | * mapping already: |
697 | */ | 702 | */ |
698 | if (cpa->flags & CPA_PAGES_ARRAY) | 703 | if (cpa->flags & CPA_PAGES_ARRAY) { |
699 | vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); | 704 | struct page *page = cpa->pages[cpa->curpage]; |
700 | else if (cpa->flags & CPA_ARRAY) | 705 | if (unlikely(PageHighMem(page))) |
706 | return 0; | ||
707 | vaddr = (unsigned long)page_address(page); | ||
708 | } else if (cpa->flags & CPA_ARRAY) | ||
701 | vaddr = cpa->vaddr[cpa->curpage]; | 709 | vaddr = cpa->vaddr[cpa->curpage]; |
702 | else | 710 | else |
703 | vaddr = *cpa->vaddr; | 711 | vaddr = *cpa->vaddr; |
@@ -706,42 +714,55 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
706 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { | 714 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { |
707 | 715 | ||
708 | alias_cpa = *cpa; | 716 | alias_cpa = *cpa; |
709 | temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); | 717 | alias_cpa.vaddr = &laddr; |
710 | alias_cpa.vaddr = &temp_cpa_vaddr; | ||
711 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); | 718 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); |
712 | 719 | ||
713 | |||
714 | ret = __change_page_attr_set_clr(&alias_cpa, 0); | 720 | ret = __change_page_attr_set_clr(&alias_cpa, 0); |
721 | if (ret) | ||
722 | return ret; | ||
715 | } | 723 | } |
716 | 724 | ||
717 | #ifdef CONFIG_X86_64 | 725 | #ifdef CONFIG_X86_64 |
718 | if (ret) | ||
719 | return ret; | ||
720 | /* | ||
721 | * No need to redo, when the primary call touched the high | ||
722 | * mapping already: | ||
723 | */ | ||
724 | if (within(vaddr, (unsigned long) _text, _brk_end)) | ||
725 | return 0; | ||
726 | |||
727 | /* | 726 | /* |
728 | * If the physical address is inside the kernel map, we need | 727 | * If the primary call didn't touch the high mapping already |
728 | * and the physical address is inside the kernel map, we need | ||
729 | * to touch the high mapped kernel as well: | 729 | * to touch the high mapped kernel as well: |
730 | */ | 730 | */ |
731 | if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) | 731 | if (!within(vaddr, (unsigned long)_text, _brk_end) && |
732 | return 0; | 732 | within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { |
733 | unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + | ||
734 | __START_KERNEL_map - phys_base; | ||
735 | alias_cpa = *cpa; | ||
736 | alias_cpa.vaddr = &temp_cpa_vaddr; | ||
737 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); | ||
733 | 738 | ||
734 | alias_cpa = *cpa; | 739 | /* |
735 | temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; | 740 | * The high mapping range is imprecise, so ignore the |
736 | alias_cpa.vaddr = &temp_cpa_vaddr; | 741 | * return value. |
737 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); | 742 | */ |
743 | __change_page_attr_set_clr(&alias_cpa, 0); | ||
744 | } | ||
745 | #endif | ||
738 | 746 | ||
739 | /* | 747 | /* |
740 | * The high mapping range is imprecise, so ignore the return value. | 748 | * If the PMD page was partially used for per-cpu remapping, |
749 | * the recycled area needs to be split and modified. Because | ||
750 | * the area is always proper subset of a PMD page | ||
751 | * cpa->numpages is guaranteed to be 1 for these areas, so | ||
752 | * there's no need to loop over and check for further remaps. | ||
741 | */ | 753 | */ |
742 | __change_page_attr_set_clr(&alias_cpa, 0); | 754 | remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); |
743 | #endif | 755 | if (remapped) { |
744 | return ret; | 756 | WARN_ON(cpa->numpages > 1); |
757 | alias_cpa = *cpa; | ||
758 | alias_cpa.vaddr = &remapped; | ||
759 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); | ||
760 | ret = __change_page_attr_set_clr(&alias_cpa, 0); | ||
761 | if (ret) | ||
762 | return ret; | ||
763 | } | ||
764 | |||
765 | return 0; | ||
745 | } | 766 | } |
746 | 767 | ||
747 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) | 768 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) |
@@ -982,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc); | |||
982 | int _set_memory_wc(unsigned long addr, int numpages) | 1003 | int _set_memory_wc(unsigned long addr, int numpages) |
983 | { | 1004 | { |
984 | int ret; | 1005 | int ret; |
1006 | unsigned long addr_copy = addr; | ||
1007 | |||
985 | ret = change_page_attr_set(&addr, numpages, | 1008 | ret = change_page_attr_set(&addr, numpages, |
986 | __pgprot(_PAGE_CACHE_UC_MINUS), 0); | 1009 | __pgprot(_PAGE_CACHE_UC_MINUS), 0); |
987 | |||
988 | if (!ret) { | 1010 | if (!ret) { |
989 | ret = change_page_attr_set(&addr, numpages, | 1011 | ret = change_page_attr_set_clr(&addr_copy, numpages, |
990 | __pgprot(_PAGE_CACHE_WC), 0); | 1012 | __pgprot(_PAGE_CACHE_WC), |
1013 | __pgprot(_PAGE_CACHE_MASK), | ||
1014 | 0, 0, NULL); | ||
991 | } | 1015 | } |
992 | return ret; | 1016 | return ret; |
993 | } | 1017 | } |
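
The _set_memory_wc() fix is that converting UC_MINUS to WC is not a matter of setting one more bit: the PWT/PCD/PAT cache-attribute field has to be cleared first, or leftover bits combine into a different memory type. That is why the second step goes through change_page_attr_set_clr(), which takes a clear mask alongside the set mask. Reduced to the bit manipulation (values illustrative, not the real _PAGE_* encodings):

	#include <stdint.h>

	#define CACHE_MASK_SKETCH	0x18	/* illustrative attribute field */
	#define CACHE_WC_SKETCH		0x08	/* illustrative WC encoding */

	/* Sketch: replace the attribute field instead of OR-ing into it. */
	static uint64_t set_clr_sketch(uint64_t pte, uint64_t set, uint64_t clr)
	{
		return (pte & ~clr) | set;
	}

	/* e.g. pte = set_clr_sketch(pte, CACHE_WC_SKETCH, CACHE_MASK_SKETCH); */
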
@@ -1104,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) | |||
1104 | int free_idx; | 1128 | int free_idx; |
1105 | 1129 | ||
1106 | for (i = 0; i < addrinarray; i++) { | 1130 | for (i = 0; i < addrinarray; i++) { |
1107 | start = (unsigned long)page_address(pages[i]); | 1131 | if (PageHighMem(pages[i])) |
1132 | continue; | ||
1133 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | ||
1108 | end = start + PAGE_SIZE; | 1134 | end = start + PAGE_SIZE; |
1109 | if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) | 1135 | if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) |
1110 | goto err_out; | 1136 | goto err_out; |
@@ -1117,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) | |||
1117 | err_out: | 1143 | err_out: |
1118 | free_idx = i; | 1144 | free_idx = i; |
1119 | for (i = 0; i < free_idx; i++) { | 1145 | for (i = 0; i < free_idx; i++) { |
1120 | start = (unsigned long)page_address(pages[i]); | 1146 | if (PageHighMem(pages[i])) |
1147 | continue; | ||
1148 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | ||
1121 | end = start + PAGE_SIZE; | 1149 | end = start + PAGE_SIZE; |
1122 | free_memtype(start, end); | 1150 | free_memtype(start, end); |
1123 | } | 1151 | } |
@@ -1146,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray) | |||
1146 | return retval; | 1174 | return retval; |
1147 | 1175 | ||
1148 | for (i = 0; i < addrinarray; i++) { | 1176 | for (i = 0; i < addrinarray; i++) { |
1149 | start = (unsigned long)page_address(pages[i]); | 1177 | if (PageHighMem(pages[i])) |
1178 | continue; | ||
1179 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | ||
1150 | end = start + PAGE_SIZE; | 1180 | end = start + PAGE_SIZE; |
1151 | free_memtype(start, end); | 1181 | free_memtype(start, end); |
1152 | } | 1182 | } |
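
The three loops above switch from page_address() to page_to_pfn() for the same reason: memtype reservation works on physical addresses, and a highmem page may have no kernel virtual mapping at all, making page_address() NULL or meaningless there. Skipping highmem pages is safe since they have no direct-map alias whose attributes could go stale. The derivation, as a sketch (kernel context assumed):

	/* Sketch: physical byte range of a page, valid with or without a
	 * kernel virtual mapping. */
	static void page_phys_range_sketch(struct page *page,
					   unsigned long *start,
					   unsigned long *end)
	{
		*start = page_to_pfn(page) << PAGE_SHIFT;
		*end   = *start + PAGE_SIZE;
	}
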
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb28065..352aa9e927e2 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -623,7 +623,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, | |||
623 | return ret; | 623 | return ret; |
624 | 624 | ||
625 | if (flags != want_flags) { | 625 | if (flags != want_flags) { |
626 | if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { | 626 | if (strict_prot || |
627 | !is_new_memtype_allowed(paddr, size, want_flags, flags)) { | ||
627 | free_memtype(paddr, paddr + size); | 628 | free_memtype(paddr, paddr + size); |
628 | printk(KERN_ERR "%s:%d map pfn expected mapping type %s" | 629 | printk(KERN_ERR "%s:%d map pfn expected mapping type %s" |
629 | " for %Lx-%Lx, got %s\n", | 630 | " for %Lx-%Lx, got %s\n", |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8e43bdd45456..ed34f5e35999 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | |||
25 | return pte; | 25 | return pte; |
26 | } | 26 | } |
27 | 27 | ||
28 | void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | 28 | void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) |
29 | { | 29 | { |
30 | pgtable_page_dtor(pte); | 30 | pgtable_page_dtor(pte); |
31 | paravirt_release_pte(page_to_pfn(pte)); | 31 | paravirt_release_pte(page_to_pfn(pte)); |
@@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | |||
33 | } | 33 | } |
34 | 34 | ||
35 | #if PAGETABLE_LEVELS > 2 | 35 | #if PAGETABLE_LEVELS > 2 |
36 | void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | 36 | void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) |
37 | { | 37 | { |
38 | paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); | 38 | paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); |
39 | tlb_remove_page(tlb, virt_to_page(pmd)); | 39 | tlb_remove_page(tlb, virt_to_page(pmd)); |
40 | } | 40 | } |
41 | 41 | ||
42 | #if PAGETABLE_LEVELS > 3 | 42 | #if PAGETABLE_LEVELS > 3 |
43 | void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) | 43 | void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) |
44 | { | 44 | { |
45 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); | 45 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); |
46 | tlb_remove_page(tlb, virt_to_page(pud)); | 46 | tlb_remove_page(tlb, virt_to_page(pud)); |
@@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve) | |||
329 | printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", | 329 | printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", |
330 | (int)-reserve); | 330 | (int)-reserve); |
331 | __FIXADDR_TOP = -reserve - PAGE_SIZE; | 331 | __FIXADDR_TOP = -reserve - PAGE_SIZE; |
332 | __VMALLOC_RESERVE += reserve; | ||
333 | #endif | 332 | #endif |
334 | } | 333 | } |
335 | 334 | ||
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 2dfcbf9df2ae..dbb5381f7b3b 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -79,8 +79,10 @@ static __init void bad_srat(void) | |||
79 | acpi_numa = -1; | 79 | acpi_numa = -1; |
80 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 80 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
81 | apicid_to_node[i] = NUMA_NO_NODE; | 81 | apicid_to_node[i] = NUMA_NO_NODE; |
82 | for (i = 0; i < MAX_NUMNODES; i++) | 82 | for (i = 0; i < MAX_NUMNODES; i++) { |
83 | nodes_add[i].start = nodes[i].end = 0; | 83 | nodes[i].start = nodes[i].end = 0; |
84 | nodes_add[i].start = nodes_add[i].end = 0; | ||
85 | } | ||
84 | remove_all_active_ranges(); | 86 | remove_all_active_ranges(); |
85 | } | 87 | } |
86 | 88 | ||
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 821e97017e95..c814e144a3f0 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
183 | 183 | ||
184 | f->flush_mm = mm; | 184 | f->flush_mm = mm; |
185 | f->flush_va = va; | 185 | f->flush_va = va; |
186 | cpumask_andnot(to_cpumask(f->flush_cpumask), | 186 | if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { |
187 | cpumask, cpumask_of(smp_processor_id())); | 187 | /* |
188 | 188 | * We have to send the IPI only to | |
189 | /* | 189 | * CPUs affected. |
190 | * We have to send the IPI only to | 190 | */ |
191 | * CPUs affected. | 191 | apic->send_IPI_mask(to_cpumask(f->flush_cpumask), |
192 | */ | 192 | INVALIDATE_TLB_VECTOR_START + sender); |
193 | apic->send_IPI_mask(to_cpumask(f->flush_cpumask), | ||
194 | INVALIDATE_TLB_VECTOR_START + sender); | ||
195 | 193 | ||
196 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) | 194 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) |
197 | cpu_relax(); | 195 | cpu_relax(); |
196 | } | ||
198 | 197 | ||
199 | f->flush_mm = NULL; | 198 | f->flush_mm = NULL; |
200 | f->flush_va = 0; | 199 | f->flush_va = 0; |
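
The restructuring hinges on cpumask_andnot() returning whether the destination mask came out non-empty: when the local CPU was the only target, both the IPI and the busy-wait for acknowledgements are skipped. A standalone sketch with one machine word standing in for the cpumask bitmap:

	#include <stdint.h>
	#include <stdio.h>

	/* Sketch: dst = src & ~self, reporting whether any CPU remains. */
	static int andnot_sketch(uint64_t *dst, uint64_t src, uint64_t self)
	{
		*dst = src & ~self;
		return *dst != 0;
	}

	int main(void)
	{
		uint64_t dst;

		if (!andnot_sketch(&dst, 1ull << 3, 1ull << 3))
			printf("local-only flush: no IPI, no wait\n");
		return 0;
	}
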
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b07dd8d0b321..89b9a5cd63da 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type) | |||
390 | static int force_arch_perfmon; | 390 | static int force_arch_perfmon; |
391 | static int force_cpu_type(const char *str, struct kernel_param *kp) | 391 | static int force_cpu_type(const char *str, struct kernel_param *kp) |
392 | { | 392 | { |
393 | if (!strcmp(str, "archperfmon")) { | 393 | if (!strcmp(str, "arch_perfmon")) { |
394 | force_arch_perfmon = 1; | 394 | force_arch_perfmon = 1; |
395 | printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); | 395 | printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); |
396 | } | 396 | } |
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c0ecf250fe51..1014eb4bfc37 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c | |||
@@ -38,15 +38,26 @@ count_resource(struct acpi_resource *acpi_res, void *data) | |||
38 | struct acpi_resource_address64 addr; | 38 | struct acpi_resource_address64 addr; |
39 | acpi_status status; | 39 | acpi_status status; |
40 | 40 | ||
41 | if (info->res_num >= PCI_BUS_NUM_RESOURCES) | ||
42 | return AE_OK; | ||
43 | |||
44 | status = resource_to_addr(acpi_res, &addr); | 41 | status = resource_to_addr(acpi_res, &addr); |
45 | if (ACPI_SUCCESS(status)) | 42 | if (ACPI_SUCCESS(status)) |
46 | info->res_num++; | 43 | info->res_num++; |
47 | return AE_OK; | 44 | return AE_OK; |
48 | } | 45 | } |
49 | 46 | ||
47 | static int | ||
48 | bus_has_transparent_bridge(struct pci_bus *bus) | ||
49 | { | ||
50 | struct pci_dev *dev; | ||
51 | |||
52 | list_for_each_entry(dev, &bus->devices, bus_list) { | ||
53 | u16 class = dev->class >> 8; | ||
54 | |||
55 | if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) | ||
56 | return true; | ||
57 | } | ||
58 | return false; | ||
59 | } | ||
60 | |||
50 | static acpi_status | 61 | static acpi_status |
51 | setup_resource(struct acpi_resource *acpi_res, void *data) | 62 | setup_resource(struct acpi_resource *acpi_res, void *data) |
52 | { | 63 | { |
@@ -56,9 +67,11 @@ setup_resource(struct acpi_resource *acpi_res, void *data) | |||
56 | acpi_status status; | 67 | acpi_status status; |
57 | unsigned long flags; | 68 | unsigned long flags; |
58 | struct resource *root; | 69 | struct resource *root; |
70 | int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; | ||
71 | u64 start, end; | ||
59 | 72 | ||
60 | if (info->res_num >= PCI_BUS_NUM_RESOURCES) | 73 | if (bus_has_transparent_bridge(info->bus)) |
61 | return AE_OK; | 74 | max_root_bus_resources -= 3; |
62 | 75 | ||
63 | status = resource_to_addr(acpi_res, &addr); | 76 | status = resource_to_addr(acpi_res, &addr); |
64 | if (!ACPI_SUCCESS(status)) | 77 | if (!ACPI_SUCCESS(status)) |
@@ -75,11 +88,22 @@ setup_resource(struct acpi_resource *acpi_res, void *data) | |||
75 | } else | 88 | } else |
76 | return AE_OK; | 89 | return AE_OK; |
77 | 90 | ||
91 | start = addr.minimum + addr.translation_offset; | ||
92 | end = start + addr.address_length - 1; | ||
93 | if (info->res_num >= max_root_bus_resources) { | ||
94 | printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " | ||
95 | "from %s for %s due to _CRS returning more than " | ||
96 | "%d resource descriptors\n", (unsigned long) start, | ||
97 | (unsigned long) end, root->name, info->name, | ||
98 | max_root_bus_resources); | ||
99 | return AE_OK; | ||
100 | } | ||
101 | |||
78 | res = &info->res[info->res_num]; | 102 | res = &info->res[info->res_num]; |
79 | res->name = info->name; | 103 | res->name = info->name; |
80 | res->flags = flags; | 104 | res->flags = flags; |
81 | res->start = addr.minimum + addr.translation_offset; | 105 | res->start = start; |
82 | res->end = res->start + addr.address_length - 1; | 106 | res->end = end; |
83 | res->child = NULL; | 107 | res->child = NULL; |
84 | 108 | ||
85 | if (insert_resource(root, res)) { | 109 | if (insert_resource(root, res)) { |
@@ -94,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) | |||
94 | } | 118 | } |
95 | 119 | ||
96 | static void | 120 | static void |
97 | adjust_transparent_bridge_resources(struct pci_bus *bus) | ||
98 | { | ||
99 | struct pci_dev *dev; | ||
100 | |||
101 | list_for_each_entry(dev, &bus->devices, bus_list) { | ||
102 | int i; | ||
103 | u16 class = dev->class >> 8; | ||
104 | |||
105 | if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) { | ||
106 | for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++) | ||
107 | dev->subordinate->resource[i] = | ||
108 | dev->bus->resource[i - 3]; | ||
109 | } | ||
110 | } | ||
111 | } | ||
112 | |||
113 | static void | ||
114 | get_current_resources(struct acpi_device *device, int busnum, | 121 | get_current_resources(struct acpi_device *device, int busnum, |
115 | int domain, struct pci_bus *bus) | 122 | int domain, struct pci_bus *bus) |
116 | { | 123 | { |
@@ -137,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum, | |||
137 | info.res_num = 0; | 144 | info.res_num = 0; |
138 | acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, | 145 | acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, |
139 | &info); | 146 | &info); |
140 | if (info.res_num) | ||
141 | adjust_transparent_bridge_resources(bus); | ||
142 | 147 | ||
143 | return; | 148 | return; |
144 | 149 | ||
@@ -201,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do | |||
201 | */ | 206 | */ |
202 | memcpy(bus->sysdata, sd, sizeof(*sd)); | 207 | memcpy(bus->sysdata, sd, sizeof(*sd)); |
203 | kfree(sd); | 208 | kfree(sd); |
204 | } else | 209 | } else { |
205 | bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); | 210 | bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); |
211 | if (bus) { | ||
212 | if (pci_probe & PCI_USE__CRS) | ||
213 | get_current_resources(device, busnum, domain, | ||
214 | bus); | ||
215 | bus->subordinate = pci_scan_child_bus(bus); | ||
216 | } | ||
217 | } | ||
206 | 218 | ||
207 | if (!bus) | 219 | if (!bus) |
208 | kfree(sd); | 220 | kfree(sd); |
@@ -217,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do | |||
217 | #endif | 229 | #endif |
218 | } | 230 | } |
219 | 231 | ||
220 | if (bus && (pci_probe & PCI_USE__CRS)) | ||
221 | get_current_resources(device, busnum, domain, bus); | ||
222 | return bus; | 232 | return bus; |
223 | } | 233 | } |
224 | 234 | ||
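
The counting changes exist because a transparent PCI-PCI bridge mirrors three of its parent bus's resource windows onto its subordinate bus, so a root bus hosting one must keep three of its PCI_BUS_NUM_RESOURCES slots free; _CRS descriptors past the reduced limit are now logged and dropped rather than silently overrunning the array. The core of it, as a sketch (kernel context assumed):

	/* Sketch: how many _CRS resources this root bus may keep. */
	static int max_crs_resources_sketch(struct pci_bus *bus)
	{
		int max = PCI_BUS_NUM_RESOURCES;

		if (bus_has_transparent_bridge(bus))
			max -= 3;	/* slots the bridge's windows occupy */
		return max;
	}
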
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index f893d6a6e803..3ffa10df20b9 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c | |||
@@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) | |||
100 | int j; | 100 | int j; |
101 | struct pci_root_info *info; | 101 | struct pci_root_info *info; |
102 | 102 | ||
103 | /* don't go for it if _CRS is used */ | 103 | /* don't go for it if _CRS is used already */ |
104 | if (pci_probe & PCI_USE__CRS) | 104 | if (b->resource[0] != &ioport_resource || |
105 | b->resource[1] != &iomem_resource) | ||
105 | return; | 106 | return; |
106 | 107 | ||
107 | /* if only one root bus, don't need to do anything */ | 108 |
@@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b) | |||
116 | if (i == pci_root_num) | 117 | if (i == pci_root_num) |
117 | return; | 118 | return; |
118 | 119 | ||
120 | printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", | ||
121 | b->number); | ||
122 | |||
119 | info = &pci_root_info[i]; | 123 | info = &pci_root_info[i]; |
120 | for (j = 0; j < info->res_num; j++) { | 124 | for (j = 0; j < info->res_num; j++) { |
121 | struct resource *res; | 125 | struct resource *res; |
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index a85bef20a3b9..52e62e57fedd 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <asm/pat.h> | 35 | #include <asm/pat.h> |
36 | #include <asm/e820.h> | 36 | #include <asm/e820.h> |
37 | #include <asm/pci_x86.h> | 37 | #include <asm/pci_x86.h> |
38 | #include <asm/io_apic.h> | ||
38 | 39 | ||
39 | 40 | ||
40 | static int | 41 | static int |
@@ -116,7 +117,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) | |||
116 | struct pci_bus *bus; | 117 | struct pci_bus *bus; |
117 | struct pci_dev *dev; | 118 | struct pci_dev *dev; |
118 | int idx; | 119 | int idx; |
119 | struct resource *r, *pr; | 120 | struct resource *r; |
120 | 121 | ||
121 | /* Depth-First Search on bus tree */ | 122 | /* Depth-First Search on bus tree */ |
122 | list_for_each_entry(bus, bus_list, node) { | 123 | list_for_each_entry(bus, bus_list, node) { |
@@ -126,9 +127,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) | |||
126 | r = &dev->resource[idx]; | 127 | r = &dev->resource[idx]; |
127 | if (!r->flags) | 128 | if (!r->flags) |
128 | continue; | 129 | continue; |
129 | pr = pci_find_parent_resource(dev, r); | 130 | if (!r->start || |
130 | if (!r->start || !pr || | 131 | pci_claim_resource(dev, idx) < 0) { |
131 | request_resource(pr, r) < 0) { | ||
132 | dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); | 132 | dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); |
133 | /* | 133 | /* |
134 | * Something is wrong with the region. | 134 | * Something is wrong with the region. |
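All three i386.c conversions in this file (here, in pcibios_allocate_resources() below, and in the ROM path) replace the same open-coded pattern with pci_claim_resource(), a helper from the PCI core. Roughly, it boils down to the following (a simplified sketch under an illustrative name, not the kernel's exact body; error paths and logging trimmed):

    /* sketch: what pci_claim_resource(dev, idx) amounts to */
    int claim_sketch(struct pci_dev *dev, int idx)
    {
            struct resource *r = &dev->resource[idx];
            struct resource *pr = pci_find_parent_resource(dev, r);

            if (!pr || request_resource(pr, r) < 0)
                    return -EINVAL;  /* caller resets r as before */
            return 0;
    }

Centralizing the pattern keeps the parent lookup and the request consistent across the three call sites.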
@@ -149,7 +149,7 @@ static void __init pcibios_allocate_resources(int pass) | |||
149 | struct pci_dev *dev = NULL; | 149 | struct pci_dev *dev = NULL; |
150 | int idx, disabled; | 150 | int idx, disabled; |
151 | u16 command; | 151 | u16 command; |
152 | struct resource *r, *pr; | 152 | struct resource *r; |
153 | 153 | ||
154 | for_each_pci_dev(dev) { | 154 | for_each_pci_dev(dev) { |
155 | pci_read_config_word(dev, PCI_COMMAND, &command); | 155 | pci_read_config_word(dev, PCI_COMMAND, &command); |
@@ -168,8 +168,7 @@ static void __init pcibios_allocate_resources(int pass) | |||
168 | (unsigned long long) r->start, | 168 | (unsigned long long) r->start, |
169 | (unsigned long long) r->end, | 169 | (unsigned long long) r->end, |
170 | r->flags, disabled, pass); | 170 | r->flags, disabled, pass); |
171 | pr = pci_find_parent_resource(dev, r); | 171 | if (pci_claim_resource(dev, idx) < 0) { |
172 | if (!pr || request_resource(pr, r) < 0) { | ||
173 | dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); | 172 | dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); |
174 | /* We'll assign a new address later */ | 173 | /* We'll assign a new address later */ |
175 | r->end -= r->start; | 174 | r->end -= r->start; |
@@ -197,7 +196,7 @@ static void __init pcibios_allocate_resources(int pass) | |||
197 | static int __init pcibios_assign_resources(void) | 196 | static int __init pcibios_assign_resources(void) |
198 | { | 197 | { |
199 | struct pci_dev *dev = NULL; | 198 | struct pci_dev *dev = NULL; |
200 | struct resource *r, *pr; | 199 | struct resource *r; |
201 | 200 | ||
202 | if (!(pci_probe & PCI_ASSIGN_ROMS)) { | 201 | if (!(pci_probe & PCI_ASSIGN_ROMS)) { |
203 | /* | 202 | /* |
@@ -209,8 +208,7 @@ static int __init pcibios_assign_resources(void) | |||
209 | r = &dev->resource[PCI_ROM_RESOURCE]; | 208 | r = &dev->resource[PCI_ROM_RESOURCE]; |
210 | if (!r->flags || !r->start) | 209 | if (!r->flags || !r->start) |
211 | continue; | 210 | continue; |
212 | pr = pci_find_parent_resource(dev, r); | 211 | if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { |
213 | if (!pr || request_resource(pr, r) < 0) { | ||
214 | r->end -= r->start; | 212 | r->end -= r->start; |
215 | r->start = 0; | 213 | r->start = 0; |
216 | } | 214 | } |
@@ -230,6 +228,12 @@ void __init pcibios_resource_survey(void) | |||
230 | pcibios_allocate_resources(1); | 228 | pcibios_allocate_resources(1); |
231 | 229 | ||
232 | e820_reserve_resources_late(); | 230 | e820_reserve_resources_late(); |
231 | /* | ||
232 | * Insert the IO APIC resources after PCI initialization has | ||
233 | * occurred to handle IO APICs that are mapped in on a BAR in | ||
234 | * PCI space, but before trying to assign unassigned pci res. | ||
235 | */ | ||
236 | ioapic_insert_resources(); | ||
233 | } | 237 | } |
234 | 238 | ||
235 | /** | 239 | /** |
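The comment in the hunk spells out the ordering constraint; as a sketch, the survey now runs in this sequence (call names taken from the surrounding file):

    /* pcibios_resource_survey(), resulting order (sketch): */
    pcibios_allocate_bus_resources(&pci_root_buses);
    pcibios_allocate_resources(0);
    pcibios_allocate_resources(1);
    e820_reserve_resources_late();
    ioapic_insert_resources();  /* after PCI has claimed BAR-mapped
                                   IO APICs, before unassigned
                                   resources are reassigned */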
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8766b0e216c5..712443ec6d43 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c | |||
@@ -523,6 +523,69 @@ reject: | |||
523 | 523 | ||
524 | static int __initdata known_bridge; | 524 | static int __initdata known_bridge; |
525 | 525 | ||
526 | static int acpi_mcfg_64bit_base_addr __initdata = FALSE; | ||
527 | |||
528 | /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ | ||
529 | struct acpi_mcfg_allocation *pci_mmcfg_config; | ||
530 | int pci_mmcfg_config_num; | ||
531 | |||
532 | static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) | ||
533 | { | ||
534 | if (!strcmp(mcfg->header.oem_id, "SGI")) | ||
535 | acpi_mcfg_64bit_base_addr = TRUE; | ||
536 | |||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | static int __init pci_parse_mcfg(struct acpi_table_header *header) | ||
541 | { | ||
542 | struct acpi_table_mcfg *mcfg; | ||
543 | unsigned long i; | ||
544 | int config_size; | ||
545 | |||
546 | if (!header) | ||
547 | return -EINVAL; | ||
548 | |||
549 | mcfg = (struct acpi_table_mcfg *)header; | ||
550 | |||
551 | /* how many config structures do we have */ | ||
552 | pci_mmcfg_config_num = 0; | ||
553 | i = header->length - sizeof(struct acpi_table_mcfg); | ||
554 | while (i >= sizeof(struct acpi_mcfg_allocation)) { | ||
555 | ++pci_mmcfg_config_num; | ||
556 | i -= sizeof(struct acpi_mcfg_allocation); | ||
557 | } | ||
558 | if (pci_mmcfg_config_num == 0) { | ||
559 | printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); | ||
560 | return -ENODEV; | ||
561 | } | ||
562 | |||
563 | config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); | ||
564 | pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); | ||
565 | if (!pci_mmcfg_config) { | ||
566 | printk(KERN_WARNING PREFIX | ||
567 | "No memory for MCFG config tables\n"); | ||
568 | return -ENOMEM; | ||
569 | } | ||
570 | |||
571 | memcpy(pci_mmcfg_config, &mcfg[1], config_size); | ||
572 | |||
573 | acpi_mcfg_oem_check(mcfg); | ||
574 | |||
575 | for (i = 0; i < pci_mmcfg_config_num; ++i) { | ||
576 | if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && | ||
577 | !acpi_mcfg_64bit_base_addr) { | ||
578 | printk(KERN_ERR PREFIX | ||
579 | "MMCONFIG not in low 4GB of memory\n"); | ||
580 | kfree(pci_mmcfg_config); | ||
581 | pci_mmcfg_config_num = 0; | ||
582 | return -ENODEV; | ||
583 | } | ||
584 | } | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
526 | static void __init __pci_mmcfg_init(int early) | 589 | static void __init __pci_mmcfg_init(int early) |
527 | { | 590 | { |
528 | /* MMCONFIG disabled */ | 591 | /* MMCONFIG disabled */ |
@@ -543,7 +606,7 @@ static void __init __pci_mmcfg_init(int early) | |||
543 | } | 606 | } |
544 | 607 | ||
545 | if (!known_bridge) | 608 | if (!known_bridge) |
546 | acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); | 609 | acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); |
547 | 610 | ||
548 | pci_mmcfg_reject_broken(early); | 611 | pci_mmcfg_reject_broken(early); |
549 | 612 | ||
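The counting loop in pci_parse_mcfg() is length arithmetic over the ACPI table layout: an MCFG table is a fixed header followed by N allocation records, so equivalently to the loop (a sketch):

    /* N MMCONFIG apertures described by the table */
    int n = (header->length - sizeof(struct acpi_table_mcfg))
                    / sizeof(struct acpi_mcfg_allocation);

and the memcpy source &mcfg[1] is simply the first byte past the fixed header, i.e. the start of the record array. The per-entry check afterwards rejects apertures above 4GB unless the OEM check (currently only "SGI") has whitelisted 64-bit base addresses.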
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index de2abbd07544..a6a198c33623 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile | |||
@@ -1,7 +1,7 @@ | |||
1 | # __restore_processor_state() restores %gs after S3 resume and so should not | 1 | # __restore_processor_state() restores %gs after S3 resume and so should not |
2 | # itself be stack-protected | 2 | # itself be stack-protected |
3 | nostackp := $(call cc-option, -fno-stack-protector) | 3 | nostackp := $(call cc-option, -fno-stack-protector) |
4 | CFLAGS_cpu_$(BITS).o := $(nostackp) | 4 | CFLAGS_cpu.o := $(nostackp) |
5 | 5 | ||
6 | obj-$(CONFIG_PM_SLEEP) += cpu.o | 6 | obj-$(CONFIG_PM_SLEEP) += cpu.o |
7 | obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o | 7 | obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o |
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 394cbb88987c..9e63db8cdee4 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c | |||
@@ -226,7 +226,7 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
226 | do_fpu_end(); | 226 | do_fpu_end(); |
227 | mtrr_ap_init(); | 227 | mtrr_ap_init(); |
228 | 228 | ||
229 | #ifdef CONFIG_X86_32 | 229 | #ifdef CONFIG_X86_OLD_MCE |
230 | mcheck_init(&boot_cpu_data); | 230 | mcheck_init(&boot_cpu_data); |
231 | #endif | 231 | #endif |
232 | } | 232 | } |
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 16a9020c8f11..88112b49f02c 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile | |||
@@ -123,6 +123,7 @@ quiet_cmd_vdso = VDSO $@ | |||
123 | -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) | 123 | -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) |
124 | 124 | ||
125 | VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) | 125 | VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) |
126 | GCOV_PROFILE := n | ||
126 | 127 | ||
127 | # | 128 | # |
128 | # Install the unstripped copy of vdso*.so listed in $(vdso-install-y). | 129 | # Install the unstripped copy of vdso*.so listed in $(vdso-install-y). |
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 172438f86a02..7410640db173 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -5,6 +5,10 @@ CFLAGS_REMOVE_time.o = -pg | |||
5 | CFLAGS_REMOVE_irq.o = -pg | 5 | CFLAGS_REMOVE_irq.o = -pg |
6 | endif | 6 | endif |
7 | 7 | ||
8 | # Make sure early boot has no stackprotector | ||
9 | nostackp := $(call cc-option, -fno-stack-protector) | ||
10 | CFLAGS_enlighten.o := $(nostackp) | ||
11 | |||
8 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | 12 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ |
9 | time.o xen-asm.o xen-asm_$(BITS).o \ | 13 | time.o xen-asm.o xen-asm_$(BITS).o \ |
10 | grant-table.o suspend.o | 14 | grant-table.o suspend.o |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0a1700a2be9c..eb33aaa8415d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -215,6 +215,7 @@ static __init void xen_init_cpuid_mask(void) | |||
215 | (1 << X86_FEATURE_ACPI)); /* disable ACPI */ | 215 | (1 << X86_FEATURE_ACPI)); /* disable ACPI */ |
216 | 216 | ||
217 | ax = 1; | 217 | ax = 1; |
218 | cx = 0; | ||
218 | xen_cpuid(&ax, &bx, &cx, &dx); | 219 | xen_cpuid(&ax, &bx, &cx, &dx); |
219 | 220 | ||
220 | /* cpuid claims we support xsave; try enabling it to see what happens */ | 221 | /* cpuid claims we support xsave; try enabling it to see what happens */ |
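Seeding cx matters because CPUID takes %ecx as an input (the sub-leaf index) for some leaves, and xen_cpuid() forwards all four registers; an uninitialized local would pass stack garbage as input. A sketch of the corrected call pattern:

    ax = 1;   /* leaf 1: processor features */
    cx = 0;   /* sub-leaf 0 -- never leave an input register unset */
    xen_cpuid(&ax, &bx, &cx, &dx);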
@@ -974,10 +975,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
974 | 975 | ||
975 | xen_domain_type = XEN_PV_DOMAIN; | 976 | xen_domain_type = XEN_PV_DOMAIN; |
976 | 977 | ||
977 | BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); | ||
978 | |||
979 | xen_setup_features(); | ||
980 | |||
981 | /* Install Xen paravirt ops */ | 978 | /* Install Xen paravirt ops */ |
982 | pv_info = xen_info; | 979 | pv_info = xen_info; |
983 | pv_init_ops = xen_init_ops; | 980 | pv_init_ops = xen_init_ops; |
@@ -986,8 +983,15 @@ asmlinkage void __init xen_start_kernel(void) | |||
986 | pv_apic_ops = xen_apic_ops; | 983 | pv_apic_ops = xen_apic_ops; |
987 | pv_mmu_ops = xen_mmu_ops; | 984 | pv_mmu_ops = xen_mmu_ops; |
988 | 985 | ||
989 | xen_init_irq_ops(); | 986 | #ifdef CONFIG_X86_64 |
987 | /* | ||
988 | * Setup percpu state. We only need to do this for 64-bit | ||
989 | * because 32-bit already has %fs set properly. | ||
990 | */ | ||
991 | load_percpu_segment(0); | ||
992 | #endif | ||
990 | 993 | ||
994 | xen_init_irq_ops(); | ||
991 | xen_init_cpuid_mask(); | 995 | xen_init_cpuid_mask(); |
992 | 996 | ||
993 | #ifdef CONFIG_X86_LOCAL_APIC | 997 | #ifdef CONFIG_X86_LOCAL_APIC |
@@ -997,6 +1001,8 @@ asmlinkage void __init xen_start_kernel(void) | |||
997 | set_xen_basic_apic_ops(); | 1001 | set_xen_basic_apic_ops(); |
998 | #endif | 1002 | #endif |
999 | 1003 | ||
1004 | xen_setup_features(); | ||
1005 | |||
1000 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { | 1006 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { |
1001 | pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; | 1007 | pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; |
1002 | pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; | 1008 | pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; |
@@ -1004,13 +1010,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1004 | 1010 | ||
1005 | machine_ops = xen_machine_ops; | 1011 | machine_ops = xen_machine_ops; |
1006 | 1012 | ||
1007 | #ifdef CONFIG_X86_64 | ||
1008 | /* | ||
1009 | * Setup percpu state. We only need to do this for 64-bit | ||
1010 | * because 32-bit already has %fs set properly. | ||
1011 | */ | ||
1012 | load_percpu_segment(0); | ||
1013 | #endif | ||
1014 | /* | 1013 | /* |
1015 | * The only reliable way to retain the initial address of the | 1014 | * The only reliable way to retain the initial address of the |
1016 | * percpu gdt_page is to remember it here, so we can go and | 1015 | * percpu gdt_page is to remember it here, so we can go and |
@@ -1061,6 +1060,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
1061 | /* set up basic CPUID stuff */ | 1060 | /* set up basic CPUID stuff */ |
1062 | cpu_detect(&new_cpu_data); | 1061 | cpu_detect(&new_cpu_data); |
1063 | new_cpu_data.hard_math = 1; | 1062 | new_cpu_data.hard_math = 1; |
1063 | new_cpu_data.wp_works_ok = 1; | ||
1064 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | 1064 | new_cpu_data.x86_capability[0] = cpuid_edx(1); |
1065 | #endif | 1065 | #endif |
1066 | 1066 | ||
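Taken together, the enlighten.c hunks reorder early boot so that percpu state is usable before anything touches it: load_percpu_segment(0) (64-bit only; 32-bit already has %fs set) now runs right after the paravirt ops are installed and before xen_init_irq_ops(), and xen_setup_features() moves after the APIC ops setup. A sketch of the resulting order, from the hunks above:

    /* xen_start_kernel(), reordered prologue (sketch) */
    pv_info = xen_info;            /* install paravirt ops ... */
    pv_mmu_ops = xen_mmu_ops;
    #ifdef CONFIG_X86_64
    load_percpu_segment(0);        /* %gs valid before percpu use */
    #endif
    xen_init_irq_ops();            /* may touch percpu state */
    xen_init_cpuid_mask();
    set_xen_basic_apic_ops();      /* under CONFIG_X86_LOCAL_APIC */
    xen_setup_features();          /* feature flags queried afterwards */

On 32-bit, new_cpu_data.wp_works_ok is now asserted directly rather than probed; a PV guest can rely on the hypervisor honoring write protection.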