Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 4
-rw-r--r--  arch/x86/boot/compressed/string.c | 4
-rw-r--r--  arch/x86/boot/string.c | 9
-rw-r--r--  arch/x86/configs/i386_defconfig | 1
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 1
-rw-r--r--  arch/x86/include/asm/atomic.h | 7
-rw-r--r--  arch/x86/include/asm/barrier.h | 4
-rw-r--r--  arch/x86/include/asm/bitops.h | 6
-rw-r--r--  arch/x86/include/asm/cmdline.h | 6
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 1
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 4
-rw-r--r--  arch/x86/include/asm/microcode.h | 1
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 2
-rw-r--r--  arch/x86/include/asm/pci.h | 1
-rw-r--r--  arch/x86/include/asm/sync_bitops.h | 2
-rw-r--r--  arch/x86/include/asm/thread_info.h | 4
-rw-r--r--  arch/x86/include/asm/uprobes.h | 16
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 2
-rw-r--r--  arch/x86/include/asm/xen/interface.h | 3
-rw-r--r--  arch/x86/kernel/aperture_64.c | 59
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 2
-rw-r--r--  arch/x86/kernel/apm_32.c | 11
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 6
-rw-r--r--  arch/x86/kernel/cpu/microcode/core_early.c | 37
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 1
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 22
-rw-r--r--  arch/x86/kernel/cpu/rdrand.c | 1
-rw-r--r--  arch/x86/kernel/entry_64.S | 185
-rw-r--r--  arch/x86/kernel/ldt.c | 4
-rw-r--r--  arch/x86/kernel/uprobes.c | 551
-rw-r--r--  arch/x86/kvm/vmx.c | 7
-rw-r--r--  arch/x86/kvm/x86.c | 6
-rw-r--r--  arch/x86/lib/Makefile | 2
-rw-r--r--  arch/x86/lib/cmdline.c | 84
-rw-r--r--  arch/x86/mm/ioremap.c | 26
-rw-r--r--  arch/x86/mm/pgtable.c | 21
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 2
-rw-r--r--  arch/x86/pci/acpi.c | 6
-rw-r--r--  arch/x86/pci/amd_bus.c | 83
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 4
-rw-r--r--  arch/x86/pci/fixup.c | 18
-rw-r--r--  arch/x86/pci/i386.c | 27
-rw-r--r--  arch/x86/realmode/rm/Makefile | 3
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 8
-rw-r--r--  arch/x86/xen/enlighten.c | 1
-rw-r--r--  arch/x86/xen/mmu.c | 125
-rw-r--r--  arch/x86/xen/p2m.c | 174
-rw-r--r--  arch/x86/xen/setup.c | 15
-rw-r--r--  arch/x86/xen/suspend.c | 23
-rw-r--r--  arch/x86/xen/xen-ops.h | 2
51 files changed, 1073 insertions, 522 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f1304d38aa21..7d5feb5908dd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -261,6 +261,9 @@ config ARCH_HWEIGHT_CFLAGS
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
+config FIX_EARLYCON_MEM
+	def_bool y
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
@@ -415,7 +418,6 @@ config X86_UV
 
 config X86_GOLDFISH
 	bool "Goldfish (Virtual Platform)"
-	depends on X86_32
 	depends on X86_EXTENDED_PLATFORM
 	---help---
 	  Enable support for the Goldfish virtual platform used primarily
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index f3c57e341402..00e788be1db9 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,9 +1,5 @@
-#include "misc.h"
 #include "../string.c"
 
-/* misc.h might pull in string_32.h which has a macro for memcpy. undef that */
-#undef memcpy
-
 #ifdef CONFIG_X86_32
 void *memcpy(void *dest, const void *src, size_t n)
 {
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5339040ef86e..493f3fd9f139 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -12,14 +12,9 @@
  * Very basic string functions
  */
 
-#include "boot.h"
+#include <linux/types.h>
+#include "ctype.h"
 
-/*
- * This file gets included in compressed/string.c which might pull in
- * string_32.h and which in turn maps memcmp to __builtin_memcmp(). Undo
- * that first.
- */
-#undef memcmp
 int memcmp(const void *s1, const void *s2, size_t len)
 {
 	u8 diff;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 619e7f7426c6..32d2e7056c87 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -244,7 +244,6 @@ CONFIG_HID_TOPSEED=y
 CONFIG_HID_PID=y
 CONFIG_USB_HIDDEV=y
 CONFIG_USB=y
-CONFIG_USB_DEBUG=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_MON=y
 CONFIG_USB_EHCI_HCD=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6181c69b786b..a481dd4755d5 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -239,7 +239,6 @@ CONFIG_HID_TOPSEED=y
 CONFIG_HID_PID=y
 CONFIG_USB_HIDDEV=y
 CONFIG_USB=y
-CONFIG_USB_DEBUG=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_MON=y
 CONFIG_USB_EHCI_HCD=y
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index b17f4f48ecd7..6dd1c7dd0473 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -7,6 +7,7 @@
 #include <asm/alternative.h>
 #include <asm/cmpxchg.h>
 #include <asm/rmwcc.h>
+#include <asm/barrier.h>
 
 /*
  * Atomic operations that C can't guarantee us. Useful for
@@ -243,12 +244,6 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
 	: : "r" ((unsigned)(mask)), "m" (*(addr))	\
 	: "memory")
 
-/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic_dec()	barrier()
-#define smp_mb__after_atomic_dec()	barrier()
-#define smp_mb__before_atomic_inc()	barrier()
-#define smp_mb__after_atomic_inc()	barrier()
-
 #ifdef CONFIG_X86_32
 # include <asm/atomic64_32.h>
 #else
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 69bbb4845020..5c7198cca5ed 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -137,6 +137,10 @@ do { \
 
 #endif
 
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic()	barrier()
+#define smp_mb__after_atomic()	barrier()
+
 /*
  * Stop RDTSC speculation. This is needed when you need to use RDTSC
  * (or get_cycles or vread that possibly accesses the TSC) in a defined
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9fc1af74dc83..afcd35d331de 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <asm/alternative.h>
 #include <asm/rmwcc.h>
+#include <asm/barrier.h>
 
 #if BITS_PER_LONG == 32
 # define _BITOPS_LONG_SHIFT 5
@@ -102,7 +103,7 @@ static inline void __set_bit(long nr, volatile unsigned long *addr)
  *
  * clear_bit() is atomic and may not be reordered. However, it does
  * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
  * in order to ensure changes are visible on other processors.
  */
 static __always_inline void
@@ -156,9 +157,6 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
 	__clear_bit(nr, addr);
 }
 
-#define smp_mb__before_clear_bit()	barrier()
-#define smp_mb__after_clear_bit()	barrier()
-
 /**
  * __change_bit - Toggle a bit in memory
  * @nr: the bit to change
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
new file mode 100644
index 000000000000..e01f7f7ccb0c
--- /dev/null
+++ b/arch/x86/include/asm/cmdline.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_CMDLINE_H
+#define _ASM_X86_CMDLINE_H
+
+int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+
+#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index a8091216963b..68c05398bba9 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -52,6 +52,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
 					 unsigned long addr, pte_t *ptep)
 {
+	ptep_clear_flush(vma, addr, ptep);
 }
 
 static inline int huge_pte_none(pte_t pte)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a307b7530e54..4615906d83df 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -190,8 +190,8 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
 #define trace_interrupt interrupt
 #endif
 
-#define VECTOR_UNDEFINED	-1
-#define VECTOR_RETRIGGERED	-2
+#define VECTOR_UNDEFINED	(-1)
+#define VECTOR_RETRIGGERED	(-2)
 
 typedef int vector_irq_t[NR_VECTORS];
 DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index b59827e76529..64dc362506b7 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -25,6 +25,7 @@ struct cpu_signature {
 struct device;
 
 enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+extern bool dis_ucode_ldr;
 
 struct microcode_ops {
 	enum ucode_state (*request_microcode_user) (int cpu,
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 8de6d9cf3b95..678205195ae1 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_X86_PAGE_64_DEFS_H
 #define _ASM_X86_PAGE_64_DEFS_H
 
-#define THREAD_SIZE_ORDER	1
+#define THREAD_SIZE_ORDER	2
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 96ae4f4040bb..0892ea0e683f 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -68,7 +68,6 @@ void pcibios_config_init(void);
 void pcibios_scan_root(int bus);
 
 void pcibios_set_master(struct pci_dev *dev);
-void pcibios_penalize_isa_irq(int irq, int active);
 struct irq_routing_table *pcibios_get_irq_routing_table(void);
 int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
index 05af3b31d522..f28a24b51dc7 100644
--- a/arch/x86/include/asm/sync_bitops.h
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -41,7 +41,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr)
  *
  * sync_clear_bit() is atomic and may not be reordered. However, it does
  * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
  * in order to ensure changes are visible on other processors.
  */
 static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 47e5de25ba79..854053889d4d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG	21	/* idle is polling for TIF_NEED_RESCHED */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
@@ -106,6 +107,7 @@ struct thread_info {
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
@@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* idle task polling need_resched,
-					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 3087ea9c5f2e..93bee7b93854 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -33,15 +33,27 @@ typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		1
 
+struct uprobe_xol_ops;
+
 struct arch_uprobe {
-	u16			fixups;
 	union {
 		u8	insn[MAX_UINSN_BYTES];
 		u8	ixol[MAX_UINSN_BYTES];
 	};
+
+	u16				fixups;
+	const struct uprobe_xol_ops	*ops;
+
+	union {
 #ifdef CONFIG_X86_64
 	unsigned long rip_rela_target_address;
 #endif
+		struct {
+			s32	offs;
+			u8	ilen;
+			u8	opc1;
+		}			branch;
+	};
 };
 
 struct arch_uprobe_task {
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index e709884d0ef9..ca08a27b90b3 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -343,7 +343,7 @@ HYPERVISOR_memory_op(unsigned int cmd, void *arg)
 }
 
 static inline int
-HYPERVISOR_multicall(void *call_list, int nr_calls)
+HYPERVISOR_multicall(void *call_list, uint32_t nr_calls)
 {
 	return _hypercall2(int, multicall, call_list, nr_calls);
 }
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index fd9cb7695b5f..3400dbaec3c3 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -54,6 +54,9 @@ typedef unsigned long xen_pfn_t;
 #define PRI_xen_pfn "lx"
 typedef unsigned long xen_ulong_t;
 #define PRI_xen_ulong "lx"
+typedef long xen_long_t;
+#define PRI_xen_long "lx"
+
 /* Guest handles for primitive C types. */
 __DEFINE_GUEST_HANDLE(uchar, unsigned char);
 __DEFINE_GUEST_HANDLE(uint, unsigned int);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9fa8aa051f54..76164e173a24 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -10,6 +10,8 @@
  *
  * Copyright 2002 Andi Kleen, SuSE Labs.
  */
+#define pr_fmt(fmt) "AGP: " fmt
+
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
@@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void)
 	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
 				      aper_size, aper_size);
 	if (!addr) {
-		printk(KERN_ERR
-			"Cannot allocate aperture memory hole (%lx,%uK)\n",
-			addr, aper_size>>10);
+		pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+		       addr, addr + aper_size - 1, aper_size >> 10);
 		return 0;
 	}
 	memblock_reserve(addr, aper_size);
-	printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
-			aper_size >> 10, addr);
+	pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+		addr, addr + aper_size - 1, aper_size >> 10);
 	register_nosave_region(addr >> PAGE_SHIFT,
 			       (addr+aper_size) >> PAGE_SHIFT);
 
@@ -126,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
 	u64 aper;
 	u32 old_order;
 
-	printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+	pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
 	apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
 	if (apsizereg == 0xffffffff) {
-		printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+		pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+		       bus, slot, func);
 		return 0;
 	}
 
@@ -153,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
 	 * On some sick chips, APSIZE is 0. It means it wants 4G
 	 * so let double check that order, and lets trust AMD NB settings:
 	 */
-	printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
-			aper, 32 << old_order);
+	pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+		bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+		32 << old_order);
 	if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
-		printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
-				32 << *order, apsizereg);
+		pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+			bus, slot, func, 32 << *order, apsizereg);
 		*order = old_order;
 	}
 
-	printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
-			aper, 32 << *order, apsizereg);
+	pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+		bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+		32 << *order, apsizereg);
 
 	if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
 		return 0;
@@ -218,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 			}
 		}
 	}
-	printk(KERN_INFO "No AGP bridge found\n");
+	pr_info("No AGP bridge found\n");
 
 	return 0;
 }
@@ -310,7 +314,8 @@ void __init early_gart_iommu_check(void)
 		if (e820_any_mapped(aper_base, aper_base + aper_size,
 				    E820_RAM)) {
 			/* reserve it, so we can reuse it in second kernel */
-			printk(KERN_INFO "update e820 for GART\n");
+			pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+				aper_base, aper_base + aper_size - 1);
 			e820_add_region(aper_base, aper_size, E820_RESERVED);
 			update_e820();
 		}
@@ -354,7 +359,7 @@ int __init gart_iommu_hole_init(void)
 	    !early_pci_allowed())
 		return -ENODEV;
 
-	printk(KERN_INFO "Checking aperture...\n");
+	pr_info("Checking aperture...\n");
 
 	if (!fallback_aper_force)
 		agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
@@ -395,8 +400,9 @@ int __init gart_iommu_hole_init(void)
 		aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
 		aper_base <<= 25;
 
-		printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
-				node, aper_base, aper_size >> 20);
+		pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+			node, aper_base, aper_base + aper_size - 1,
+			aper_size >> 20);
 		node++;
 
 		if (!aperture_valid(aper_base, aper_size, 64<<20)) {
@@ -407,9 +413,9 @@ int __init gart_iommu_hole_init(void)
 			if (!no_iommu &&
 			    max_pfn > MAX_DMA32_PFN &&
 			    !printed_gart_size_msg) {
-				printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
-				printk(KERN_ERR "please increase GART size in your BIOS setup\n");
-				printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+				pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+				pr_err("please increase GART size in your BIOS setup\n");
+				pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
 				printed_gart_size_msg = 1;
 			}
 		} else {
@@ -446,13 +452,10 @@ out:
 	    force_iommu ||
 	    valid_agp ||
 	    fallback_aper_force) {
-		printk(KERN_INFO
-			"Your BIOS doesn't leave a aperture memory hole\n");
-		printk(KERN_INFO
-			"Please enable the IOMMU option in the BIOS setup\n");
-		printk(KERN_INFO
-			"This costs you %d MB of RAM\n",
-			32 << fallback_aper_order);
+		pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+		pr_info("Please enable the IOMMU option in the BIOS setup\n");
+		pr_info("This costs you %dMB of RAM\n",
+			32 << fallback_aper_order);
 
 		aper_order = fallback_aper_order;
 		aper_alloc = allocate_aperture();
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index a698d7165c96..eab67047dec3 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -57,7 +57,7 @@ void arch_trigger_all_cpu_backtrace(void)
 	}
 
 	clear_bit(0, &backtrace_flag);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 }
 
 static int __kprobes
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab03430211d..f3a1f04ed4cb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -844,21 +844,10 @@ static int apm_do_idle(void)
 	int polling;
 	int err = 0;
 
-	polling = !!(current_thread_info()->status & TS_POLLING);
-	if (polling) {
-		current_thread_info()->status &= ~TS_POLLING;
-		/*
-		 * TS_POLLING-cleared state must be visible before we
-		 * test NEED_RESCHED:
-		 */
-		smp_mb();
-	}
 	if (!need_resched()) {
 		idled = 1;
 		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
 	}
-	if (polling)
-		current_thread_info()->status |= TS_POLLING;
 
 	if (!idled)
 		return 0;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0f..dd9d6190b08d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -97,6 +97,9 @@ MODULE_LICENSE("GPL");
 
 static struct microcode_ops	*microcode_ops;
 
+bool dis_ucode_ldr;
+module_param(dis_ucode_ldr, bool, 0);
+
 /*
  * Synchronization.
  *
@@ -546,6 +549,9 @@ static int __init microcode_init(void)
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	int error;
 
+	if (dis_ucode_ldr)
+		return 0;
+
 	if (c->x86_vendor == X86_VENDOR_INTEL)
 		microcode_ops = init_intel_microcode();
 	else if (c->x86_vendor == X86_VENDOR_AMD)
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f577..5f28a64e71ea 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -17,9 +17,11 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/module.h>
+#include <asm/microcode.h>
 #include <asm/microcode_intel.h>
 #include <asm/microcode_amd.h>
 #include <asm/processor.h>
+#include <asm/cmdline.h>
 
 #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
 #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
@@ -72,10 +74,33 @@ static int x86_family(void)
 	return x86;
 }
 
+static bool __init check_loader_disabled_bsp(void)
+{
+#ifdef CONFIG_X86_32
+	const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+	const char *opt	    = "dis_ucode_ldr";
+	const char *option  = (const char *)__pa_nodebug(opt);
+	bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+	const char *cmdline = boot_command_line;
+	const char *option  = "dis_ucode_ldr";
+	bool *res = &dis_ucode_ldr;
+#endif
+
+	if (cmdline_find_option_bool(cmdline, option))
+		*res = true;
+
+	return *res;
+}
+
 void __init load_ucode_bsp(void)
 {
 	int vendor, x86;
 
+	if (check_loader_disabled_bsp())
+		return;
+
 	if (!have_cpuid_p())
 		return;
 
@@ -96,10 +121,22 @@ void __init load_ucode_bsp(void)
 	}
 }
 
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+	return __pa_nodebug(dis_ucode_ldr);
+#else
+	return dis_ucode_ldr;
+#endif
+}
+
 void load_ucode_ap(void)
 {
 	int vendor, x86;
 
+	if (check_loader_disabled_ap())
+		return;
+
 	if (!have_cpuid_p())
 		return;
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ae407f7226c8..89f3b7c1af20 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -721,6 +721,7 @@ int perf_assign_events(struct perf_event **events, int n,
 
 	return sched.state.unassigned;
 }
+EXPORT_SYMBOL_GPL(perf_assign_events);
 
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index aa333d966886..adb02aa62af5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -169,7 +169,6 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-	FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */
 	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
 	EVENT_CONSTRAINT_END
 };
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ae96cfa5eddd..980970cb744d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status)
 	return val;
 }
 
-static u64 precise_store_data_hsw(u64 status)
+static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
 {
 	union perf_mem_data_src dse;
+	u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
 
 	dse.val = 0;
 	dse.mem_op = PERF_MEM_OP_STORE;
 	dse.mem_lvl = PERF_MEM_LVL_NA;
+
+	/*
+	 * L1 info only valid for following events:
+	 *
+	 * MEM_UOPS_RETIRED.STLB_MISS_STORES
+	 * MEM_UOPS_RETIRED.LOCK_STORES
+	 * MEM_UOPS_RETIRED.SPLIT_STORES
+	 * MEM_UOPS_RETIRED.ALL_STORES
+	 */
+	if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
+		return dse.mem_lvl;
+
 	if (status & 1)
-		dse.mem_lvl = PERF_MEM_LVL_L1;
+		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+	else
+		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+
 	/* Nothing else supported. Sorry. */
 	return dse.val;
 }
@@ -887,7 +903,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 			data.data_src.val = load_latency_data(pebs->dse);
 		else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
 			data.data_src.val =
-				precise_store_data_hsw(pebs->dse);
+				precise_store_data_hsw(event, pebs->dse);
 		else
 			data.data_src.val = precise_store_data(pebs->dse);
 	}
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 384df5105fbc..136ac74dee82 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -27,6 +27,7 @@
 static int __init x86_rdrand_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_RDRAND);
+	setup_clear_cpu_cap(X86_FEATURE_RDSEED);
 	return 1;
 }
 __setup("nordrand", x86_rdrand_setup);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..be846d2468f7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -36,7 +36,7 @@
  * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
  *   frame that is otherwise undefined after a SYSCALL
  * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ * - idtentry - Define exception entry points.
  */
 
 #include <linux/linkage.h>
@@ -1203,125 +1203,100 @@ apicinterrupt IRQ_WORK_VECTOR \
 /*
  * Exception entry points.
  */
-.macro zeroentry sym do_sym
-ENTRY(\sym)
-	INTR_FRAME
-	ASM_CLAC
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
-	call error_entry
-	DEFAULT_FRAME 0
-	movq %rsp,%rdi		/* pt_regs pointer */
-	xorl %esi,%esi		/* no error code */
-	call \do_sym
-	jmp error_exit		/* %ebx: no swapgs flag */
-	CFI_ENDPROC
-END(\sym)
-.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
 
-.macro paranoidzeroentry sym do_sym
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
-	INTR_FRAME
-	ASM_CLAC
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
-	call save_paranoid
-	TRACE_IRQS_OFF
-	movq %rsp,%rdi		/* pt_regs pointer */
-	xorl %esi,%esi		/* no error code */
-	call \do_sym
-	jmp paranoid_exit	/* %ebx: no swapgs flag */
-	CFI_ENDPROC
-END(\sym)
-.endm
+	/* Sanity check */
+	.if \shift_ist != -1 && \paranoid == 0
+	.error "using shift_ist requires paranoid=1"
+	.endif
 
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
+	.if \has_error_code
+	XCPT_FRAME
+	.else
 	INTR_FRAME
-	ASM_CLAC
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
-	call save_paranoid
-	TRACE_IRQS_OFF_DEBUG
-	movq %rsp,%rdi		/* pt_regs pointer */
-	xorl %esi,%esi		/* no error code */
-	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
-	call \do_sym
-	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
-	jmp paranoid_exit	/* %ebx: no swapgs flag */
-	CFI_ENDPROC
-END(\sym)
-.endm
+	.endif
 
-.macro errorentry sym do_sym
-ENTRY(\sym)
-	XCPT_FRAME
 	ASM_CLAC
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
+
+	.ifeq \has_error_code
+	pushq_cfi $-1			/* ORIG_RAX: no syscall to restart */
+	.endif
+
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+
+	.if \paranoid
+	call save_paranoid
+	.else
 	call error_entry
+	.endif
+
 	DEFAULT_FRAME 0
+
+	.if \paranoid
+	.if \shift_ist != -1
+	TRACE_IRQS_OFF_DEBUG		/* reload IDT in case of recursion */
+	.else
+	TRACE_IRQS_OFF
+	.endif
+	.endif
+
 	movq %rsp,%rdi			/* pt_regs pointer */
+
+	.if \has_error_code
 	movq ORIG_RAX(%rsp),%rsi	/* get error code */
 	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	.else
+	xorl %esi,%esi			/* no error code */
+	.endif
+
+	.if \shift_ist != -1
+	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+	.endif
+
 	call \do_sym
+
+	.if \shift_ist != -1
+	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+	.endif
+
+	.if \paranoid
+	jmp paranoid_exit		/* %ebx: no swapgs flag */
+	.else
 	jmp error_exit			/* %ebx: no swapgs flag */
+	.endif
+
 	CFI_ENDPROC
 END(\sym)
 .endm
 
 #ifdef CONFIG_TRACING
-.macro trace_errorentry sym do_sym
-errorentry trace(\sym) trace(\do_sym)
-errorentry \sym \do_sym
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
+idtentry \sym \do_sym has_error_code=\has_error_code
 .endm
 #else
-.macro trace_errorentry sym do_sym
-errorentry \sym \do_sym
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry \sym \do_sym has_error_code=\has_error_code
 .endm
 #endif
 
-	/* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
-	XCPT_FRAME
-	ASM_CLAC
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
-	call save_paranoid
-	DEFAULT_FRAME 0
-	TRACE_IRQS_OFF
-	movq %rsp,%rdi			/* pt_regs pointer */
-	movq ORIG_RAX(%rsp),%rsi	/* get error code */
-	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
-	call \do_sym
-	jmp paranoid_exit		/* %ebx: no swapgs flag */
-	CFI_ENDPROC
-END(\sym)
-.endm
-
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
 
 
 	/* Reload gs selector with exception handling */
@@ -1371,7 +1346,7 @@ ENTRY(do_softirq_own_stack)
 END(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
+idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
 
 /*
  * A note on the "critical region" in our callback handler.
@@ -1482,21 +1457,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
  */
 	.pushsection .kprobes.text, "ax"
 
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
 #ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
 #endif
-errorentry general_protection do_general_protection
-trace_errorentry page_fault do_page_fault
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
 #ifdef CONFIG_KVM_GUEST
-errorentry async_page_fault do_async_page_fault
+idtentry async_page_fault do_async_page_fault has_error_code=1
 #endif
 #ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
 #endif
 
 	/*
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index af1d14a9ebda..dcbbaa165bde 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,6 +20,8 @@
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
 
+int sysctl_ldt16 = 0;
+
 #ifdef CONFIG_SMP
 static void flush_ldt(void *current_mm)
 {
@@ -234,7 +236,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	 * IRET leaking the high bits of the kernel stack address.
 	 */
 #ifdef CONFIG_X86_64
-	if (!ldt_info.seg_32bit) {
+	if (!ldt_info.seg_32bit && !sysctl_ldt16) {
 		error = -EINVAL;
 		goto out_unlock;
 	}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 2ed845928b5f..ace22916ade3 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -53,7 +53,7 @@
 #define OPCODE1(insn)		((insn)->opcode.bytes[0])
 #define OPCODE2(insn)		((insn)->opcode.bytes[1])
 #define OPCODE3(insn)		((insn)->opcode.bytes[2])
-#define MODRM_REG(insn)		X86_MODRM_REG(insn->modrm.value)
+#define MODRM_REG(insn)		X86_MODRM_REG((insn)->modrm.value)
 
 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
 	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
@@ -229,63 +229,6 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
 	return -ENOTSUPP;
 }
 
-/*
- * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
- * annotate arch_uprobe->fixups accordingly. To start with,
- * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
- */
-static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
-{
-	bool fix_ip = true, fix_call = false;	/* defaults */
-	int reg;
-
-	insn_get_opcode(insn);	/* should be a nop */
-
-	switch (OPCODE1(insn)) {
-	case 0x9d:
-		/* popf */
-		auprobe->fixups |= UPROBE_FIX_SETF;
-		break;
-	case 0xc3:		/* ret/lret */
-	case 0xcb:
-	case 0xc2:
-	case 0xca:
-		/* ip is correct */
-		fix_ip = false;
-		break;
-	case 0xe8:		/* call relative - Fix return addr */
-		fix_call = true;
-		break;
-	case 0x9a:		/* call absolute - Fix return addr, not ip */
-		fix_call = true;
-		fix_ip = false;
-		break;
-	case 0xff:
-		insn_get_modrm(insn);
-		reg = MODRM_REG(insn);
-		if (reg == 2 || reg == 3) {
-			/* call or lcall, indirect */
-			/* Fix return addr; ip is correct. */
-			fix_call = true;
-			fix_ip = false;
-		} else if (reg == 4 || reg == 5) {
-			/* jmp or ljmp, indirect */
-			/* ip is correct. */
-			fix_ip = false;
-		}
-		break;
-	case 0xea:		/* jmp absolute -- ip is correct */
-		fix_ip = false;
-		break;
-	default:
-		break;
-	}
-	if (fix_ip)
-		auprobe->fixups |= UPROBE_FIX_IP;
-	if (fix_call)
-		auprobe->fixups |= UPROBE_FIX_CALL;
-}
-
 #ifdef CONFIG_X86_64
 /*
  * If arch_uprobe->insn doesn't use rip-relative addressing, return
@@ -310,15 +253,11 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
  * - The displacement is always 4 bytes.
  */
 static void
-handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn)
 {
 	u8 *cursor;
 	u8 reg;
 
-	if (mm->context.ia32_compat)
-		return;
-
-	auprobe->rip_rela_target_address = 0x0;
 	if (!insn_rip_relative(insn))
 		return;
 
@@ -372,7 +311,48 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins
 		cursor++;
 		memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
 	}
-	return;
+}
+
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void
+pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+				struct arch_uprobe_task *autask)
+{
+	if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
+		autask->saved_scratch_register = regs->ax;
+		regs->ax = current->utask->vaddr;
+		regs->ax += auprobe->rip_rela_target_address;
+	} else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
+		autask->saved_scratch_register = regs->cx;
+		regs->cx = current->utask->vaddr;
+		regs->cx += auprobe->rip_rela_target_address;
+	}
+}
+
+static void
+handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+{
+	if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) {
+		struct arch_uprobe_task *autask;
+
+		autask = &current->utask->autask;
+		if (auprobe->fixups & UPROBE_FIX_RIP_AX)
+			regs->ax = autask->saved_scratch_register;
+		else
+			regs->cx = autask->saved_scratch_register;
+
+		/*
+		 * The original instruction includes a displacement, and so
+		 * is 4 bytes longer than what we've just single-stepped.
+		 * Caller may need to apply other fixups to handle stuff
+		 * like "jmpq *...(%rip)" and "callq *...(%rip)".
+		 */
+		if (correction)
+			*correction += 4;
+	}
 }
 
 static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
@@ -401,9 +381,19 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
 		return validate_insn_64bits(auprobe, insn);
 }
 #else /* 32-bit: */
-static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
+				struct arch_uprobe_task *autask)
+{
+}
+static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs,
+					long *correction)
 {
-	/* No RIP-relative addressing on 32-bit */
 }
 
 static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
@@ -412,141 +402,311 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
412} 402}
413#endif /* CONFIG_X86_64 */ 403#endif /* CONFIG_X86_64 */
414 404
415/** 405struct uprobe_xol_ops {
416 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. 406 bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
417 * @mm: the probed address space. 407 int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
418 * @arch_uprobe: the probepoint information. 408 int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
419 * @addr: virtual address at which to install the probepoint 409};
420 * Return 0 on success or a -ve number on error. 410
411static inline int sizeof_long(void)
412{
413 return is_ia32_task() ? 4 : 8;
414}
415
416static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
417{
418 pre_xol_rip_insn(auprobe, regs, &current->utask->autask);
419 return 0;
420}
421
422/*
423 * Adjust the return address pushed by a call insn executed out of line.
421 */ 424 */
422int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) 425static int adjust_ret_addr(unsigned long sp, long correction)
423{ 426{
424 int ret; 427 int rasize = sizeof_long();
425 struct insn insn; 428 long ra;
426 429
427 auprobe->fixups = 0; 430 if (copy_from_user(&ra, (void __user *)sp, rasize))
428 ret = validate_insn_bits(auprobe, mm, &insn); 431 return -EFAULT;
429 if (ret != 0)
430 return ret;
431 432
432 handle_riprel_insn(auprobe, mm, &insn); 433 ra += correction;
433 prepare_fixups(auprobe, &insn); 434 if (copy_to_user((void __user *)sp, &ra, rasize))
435 return -EFAULT;
434 436
435 return 0; 437 return 0;
436} 438}
437 439
438#ifdef CONFIG_X86_64 440static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
439/*
440 * If we're emulating a rip-relative instruction, save the contents
441 * of the scratch register and store the target address in that register.
442 */
443static void
444pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
445 struct arch_uprobe_task *autask)
446{ 441{
447 if (auprobe->fixups & UPROBE_FIX_RIP_AX) { 442 struct uprobe_task *utask = current->utask;
448 autask->saved_scratch_register = regs->ax; 443 long correction = (long)(utask->vaddr - utask->xol_vaddr);
449 regs->ax = current->utask->vaddr; 444
450 regs->ax += auprobe->rip_rela_target_address; 445 handle_riprel_post_xol(auprobe, regs, &correction);
451 } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { 446 if (auprobe->fixups & UPROBE_FIX_IP)
452 autask->saved_scratch_register = regs->cx; 447 regs->ip += correction;
453 regs->cx = current->utask->vaddr; 448
454 regs->cx += auprobe->rip_rela_target_address; 449 if (auprobe->fixups & UPROBE_FIX_CALL) {
450 if (adjust_ret_addr(regs->sp, correction)) {
451 regs->sp += sizeof_long();
452 return -ERESTART;
453 }
455 } 454 }
455
456 return 0;
456} 457}
457#else 458
458static void 459static struct uprobe_xol_ops default_xol_ops = {
459pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, 460 .pre_xol = default_pre_xol_op,
460 struct arch_uprobe_task *autask) 461 .post_xol = default_post_xol_op,
462};
463
464static bool branch_is_call(struct arch_uprobe *auprobe)
461{ 465{
462 /* No RIP-relative addressing on 32-bit */ 466 return auprobe->branch.opc1 == 0xe8;
463} 467}
464#endif
465 468
466/* 469#define CASE_COND \
467 * arch_uprobe_pre_xol - prepare to execute out of line. 470 COND(70, 71, XF(OF)) \
468 * @auprobe: the probepoint information. 471 COND(72, 73, XF(CF)) \
469 * @regs: reflects the saved user state of current task. 472 COND(74, 75, XF(ZF)) \
470 */ 473 COND(78, 79, XF(SF)) \
471int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) 474 COND(7a, 7b, XF(PF)) \
472{ 475 COND(76, 77, XF(CF) || XF(ZF)) \
473 struct arch_uprobe_task *autask; 476 COND(7c, 7d, XF(SF) != XF(OF)) \
477 COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
474 478
475 autask = &current->utask->autask; 479#define COND(op_y, op_n, expr) \
476 autask->saved_trap_nr = current->thread.trap_nr; 480 case 0x ## op_y: DO((expr) != 0) \
477 current->thread.trap_nr = UPROBE_TRAP_NR; 481 case 0x ## op_n: DO((expr) == 0)
478 regs->ip = current->utask->xol_vaddr;
479 pre_xol_rip_insn(auprobe, regs, autask);
480 482
481 autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); 483#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
482 regs->flags |= X86_EFLAGS_TF;
483 if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
484 set_task_blockstep(current, false);
485 484
486 return 0; 485static bool is_cond_jmp_opcode(u8 opcode)
486{
487 switch (opcode) {
488 #define DO(expr) \
489 return true;
490 CASE_COND
491 #undef DO
492
493 default:
494 return false;
495 }
487} 496}
488 497
489/* 498static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
490 * This function is called by arch_uprobe_post_xol() to adjust the return
491 * address pushed by a call instruction executed out of line.
492 */
493static int adjust_ret_addr(unsigned long sp, long correction)
494{ 499{
495 int rasize, ncopied; 500 unsigned long flags = regs->flags;
496 long ra = 0;
497 501
498 if (is_ia32_task()) 502 switch (auprobe->branch.opc1) {
499 rasize = 4; 503 #define DO(expr) \
500 else 504 return expr;
501 rasize = 8; 505 CASE_COND
506 #undef DO
502 507
503 ncopied = copy_from_user(&ra, (void __user *)sp, rasize); 508 default: /* not a conditional jmp */
504 if (unlikely(ncopied)) 509 return true;
505 return -EFAULT; 510 }
511}
506 512
507 ra += correction; 513#undef XF
508 ncopied = copy_to_user((void __user *)sp, &ra, rasize); 514#undef COND
509 if (unlikely(ncopied)) 515#undef CASE_COND
510 return -EFAULT;
511 516
512 return 0; 517static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
518{
519 unsigned long new_ip = regs->ip += auprobe->branch.ilen;
520 unsigned long offs = (long)auprobe->branch.offs;
521
522 if (branch_is_call(auprobe)) {
523 unsigned long new_sp = regs->sp - sizeof_long();
524 /*
525 * If it fails we execute this (mangled, see the comment in
526 * branch_clear_offset) insn out-of-line. In the likely case
527 * this should trigger the trap, and the probed application
528 * should die or restart the same insn after it handles the
 529 * signal; arch_uprobe_post_xol() won't even be called.
530 *
 531 * But there is a corner case, see the comment in ->post_xol().
532 */
533 if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long()))
534 return false;
535 regs->sp = new_sp;
536 } else if (!check_jmp_cond(auprobe, regs)) {
537 offs = 0;
538 }
539
540 regs->ip = new_ip + offs;
541 return true;
513} 542}
514 543
515#ifdef CONFIG_X86_64 544static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
516static bool is_riprel_insn(struct arch_uprobe *auprobe)
517{ 545{
518 return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); 546 BUG_ON(!branch_is_call(auprobe));
547 /*
548 * We can only get here if branch_emulate_op() failed to push the ret
549 * address _and_ another thread expanded our stack before the (mangled)
550 * "call" insn was executed out-of-line. Just restore ->sp and restart.
551 * We could also restore ->ip and try to call branch_emulate_op() again.
552 */
553 regs->sp += sizeof_long();
554 return -ERESTART;
519} 555}
520 556
521static void 557static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
522handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
523{ 558{
524 if (is_riprel_insn(auprobe)) { 559 /*
525 struct arch_uprobe_task *autask; 560 * Turn this insn into "call 1f; 1:", this is what we will execute
561 * out-of-line if ->emulate() fails. We only need this to generate
562 * a trap, so that the probed task receives the correct signal with
563 * the properly filled siginfo.
564 *
 565 * But see the comment in ->post_xol(); in the unlikely case it can
 566 * succeed, so we need to ensure that the new ->ip cannot fall into
567 * the non-canonical area and trigger #GP.
568 *
569 * We could turn it into (say) "pushf", but then we would need to
570 * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
571 * of ->insn[] for set_orig_insn().
572 */
573 memset(auprobe->insn + insn_offset_immediate(insn),
574 0, insn->immediate.nbytes);
575}
526 576
527 autask = &current->utask->autask; 577static struct uprobe_xol_ops branch_xol_ops = {
528 if (auprobe->fixups & UPROBE_FIX_RIP_AX) 578 .emulate = branch_emulate_op,
529 regs->ax = autask->saved_scratch_register; 579 .post_xol = branch_post_xol_op,
530 else 580};
531 regs->cx = autask->saved_scratch_register; 581
582/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
583static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
584{
585 u8 opc1 = OPCODE1(insn);
586
587 /* has the side-effect of processing the entire instruction */
588 insn_get_length(insn);
589 if (WARN_ON_ONCE(!insn_complete(insn)))
590 return -ENOEXEC;
591
592 switch (opc1) {
593 case 0xeb: /* jmp 8 */
594 case 0xe9: /* jmp 32 */
595 case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
596 break;
597
598 case 0xe8: /* call relative */
599 branch_clear_offset(auprobe, insn);
600 break;
532 601
602 case 0x0f:
603 if (insn->opcode.nbytes != 2)
604 return -ENOSYS;
533 /* 605 /*
534 * The original instruction includes a displacement, and so 606 * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
535 * is 4 bytes longer than what we've just single-stepped. 607 * OPCODE1() of the "short" jmp which checks the same condition.
536 * Fall through to handle stuff like "jmpq *...(%rip)" and
537 * "callq *...(%rip)".
538 */ 608 */
539 if (correction) 609 opc1 = OPCODE2(insn) - 0x10;
540 *correction += 4; 610 default:
611 if (!is_cond_jmp_opcode(opc1))
612 return -ENOSYS;
541 } 613 }
614
615 auprobe->branch.opc1 = opc1;
616 auprobe->branch.ilen = insn->length;
617 auprobe->branch.offs = insn->immediate.value;
618
619 auprobe->ops = &branch_xol_ops;
620 return 0;
542} 621}
543#else 622
544static void 623/**
545handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) 624 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
625 * @mm: the probed address space.
 626 * @auprobe: the probepoint information.
 627 * @addr: virtual address at which to install the probepoint.
628 * Return 0 on success or a -ve number on error.
629 */
630int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
631{
632 struct insn insn;
633 bool fix_ip = true, fix_call = false;
634 int ret;
635
636 ret = validate_insn_bits(auprobe, mm, &insn);
637 if (ret)
638 return ret;
639
640 ret = branch_setup_xol_ops(auprobe, &insn);
641 if (ret != -ENOSYS)
642 return ret;
643
644 /*
645 * Figure out which fixups arch_uprobe_post_xol() will need to perform,
646 * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups
647 * is either zero or it reflects rip-related fixups.
648 */
649 switch (OPCODE1(&insn)) {
650 case 0x9d: /* popf */
651 auprobe->fixups |= UPROBE_FIX_SETF;
652 break;
653 case 0xc3: /* ret or lret -- ip is correct */
654 case 0xcb:
655 case 0xc2:
656 case 0xca:
657 fix_ip = false;
658 break;
659 case 0x9a: /* call absolute - Fix return addr, not ip */
660 fix_call = true;
661 fix_ip = false;
662 break;
663 case 0xea: /* jmp absolute -- ip is correct */
664 fix_ip = false;
665 break;
666 case 0xff:
667 insn_get_modrm(&insn);
668 switch (MODRM_REG(&insn)) {
669 case 2: case 3: /* call or lcall, indirect */
670 fix_call = true;
671 case 4: case 5: /* jmp or ljmp, indirect */
672 fix_ip = false;
673 }
674 /* fall through */
675 default:
676 handle_riprel_insn(auprobe, &insn);
677 }
678
679 if (fix_ip)
680 auprobe->fixups |= UPROBE_FIX_IP;
681 if (fix_call)
682 auprobe->fixups |= UPROBE_FIX_CALL;
683
684 auprobe->ops = &default_xol_ops;
685 return 0;
686}
687
688/*
689 * arch_uprobe_pre_xol - prepare to execute out of line.
690 * @auprobe: the probepoint information.
691 * @regs: reflects the saved user state of current task.
692 */
693int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
546{ 694{
547 /* No RIP-relative addressing on 32-bit */ 695 struct uprobe_task *utask = current->utask;
696
697 regs->ip = utask->xol_vaddr;
698 utask->autask.saved_trap_nr = current->thread.trap_nr;
699 current->thread.trap_nr = UPROBE_TRAP_NR;
700
701 utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
702 regs->flags |= X86_EFLAGS_TF;
703 if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
704 set_task_blockstep(current, false);
705
706 if (auprobe->ops->pre_xol)
707 return auprobe->ops->pre_xol(auprobe, regs);
708 return 0;
548} 709}
549#endif
550 710
551/* 711/*
 552 * If xol insn itself traps and generates a signal (say, 712 * If xol insn itself traps and generates a signal (say,
@@ -592,22 +752,25 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)
592 */ 752 */
593int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) 753int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
594{ 754{
595 struct uprobe_task *utask; 755 struct uprobe_task *utask = current->utask;
596 long correction;
597 int result = 0;
598 756
599 WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); 757 WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
600 758
601 utask = current->utask; 759 if (auprobe->ops->post_xol) {
602 current->thread.trap_nr = utask->autask.saved_trap_nr; 760 int err = auprobe->ops->post_xol(auprobe, regs);
603 correction = (long)(utask->vaddr - utask->xol_vaddr); 761 if (err) {
604 handle_riprel_post_xol(auprobe, regs, &correction); 762 arch_uprobe_abort_xol(auprobe, regs);
605 if (auprobe->fixups & UPROBE_FIX_IP) 763 /*
606 regs->ip += correction; 764 * Restart the probed insn. ->post_xol() must ensure
607 765 * this is really possible if it returns -ERESTART.
608 if (auprobe->fixups & UPROBE_FIX_CALL) 766 */
609 result = adjust_ret_addr(regs->sp, correction); 767 if (err == -ERESTART)
768 return 0;
769 return err;
770 }
771 }
610 772
773 current->thread.trap_nr = utask->autask.saved_trap_nr;
611 /* 774 /*
612 * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP 775 * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
613 * so we can get an extra SIGTRAP if we do not clear TF. We need 776 * so we can get an extra SIGTRAP if we do not clear TF. We need
@@ -618,7 +781,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
618 else if (!(auprobe->fixups & UPROBE_FIX_SETF)) 781 else if (!(auprobe->fixups & UPROBE_FIX_SETF))
619 regs->flags &= ~X86_EFLAGS_TF; 782 regs->flags &= ~X86_EFLAGS_TF;
620 783
621 return result; 784 return 0;
622} 785}
623 786
624/* callback routine for handling exceptions. */ 787/* callback routine for handling exceptions. */
@@ -652,8 +815,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
652 815
653/* 816/*
654 * This function gets called when XOL instruction either gets trapped or 817 * This function gets called when XOL instruction either gets trapped or
655 * the thread has a fatal signal, so reset the instruction pointer to its 818 * the thread has a fatal signal, or if arch_uprobe_post_xol() failed.
656 * probed address. 819 * Reset the instruction pointer to its probed address for the potential
820 * restart or for post mortem analysis.
657 */ 821 */
658void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) 822void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
659{ 823{
@@ -668,25 +832,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
668 regs->flags &= ~X86_EFLAGS_TF; 832 regs->flags &= ~X86_EFLAGS_TF;
669} 833}
670 834
671/*
672 * Skip these instructions as per the currently known x86 ISA.
673 * rep=0x66*; nop=0x90
674 */
675static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) 835static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
676{ 836{
677 int i; 837 if (auprobe->ops->emulate)
678 838 return auprobe->ops->emulate(auprobe, regs);
679 for (i = 0; i < MAX_UINSN_BYTES; i++) {
680 if (auprobe->insn[i] == 0x66)
681 continue;
682
683 if (auprobe->insn[i] == 0x90) {
684 regs->ip += i + 1;
685 return true;
686 }
687
688 break;
689 }
690 return false; 839 return false;
691} 840}
692 841
@@ -701,23 +850,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
701unsigned long 850unsigned long
702arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) 851arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
703{ 852{
704 int rasize, ncopied; 853 int rasize = sizeof_long(), nleft;
705 unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ 854 unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
706 855
707 rasize = is_ia32_task() ? 4 : 8; 856 if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
708 ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize);
709 if (unlikely(ncopied))
710 return -1; 857 return -1;
711 858
712 /* check whether address has been already hijacked */ 859 /* check whether address has been already hijacked */
713 if (orig_ret_vaddr == trampoline_vaddr) 860 if (orig_ret_vaddr == trampoline_vaddr)
714 return orig_ret_vaddr; 861 return orig_ret_vaddr;
715 862
716 ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); 863 nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
717 if (likely(!ncopied)) 864 if (likely(!nleft))
718 return orig_ret_vaddr; 865 return orig_ret_vaddr;
719 866
720 if (ncopied != rasize) { 867 if (nleft != rasize) {
721 pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " 868 pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
722 "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); 869 "%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
723 870
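
The CASE_COND/COND/XF macro table above encodes the taken/not-taken test for every short conditional jump. For reference, here is a macro-free sketch of the same decision; the flag masks are the architectural EFLAGS bit positions, and the helper name is invented for illustration, not kernel code.

#include <stdbool.h>
#include <stdint.h>

/* Architectural EFLAGS bits (stand-ins for the X86_EFLAGS_* masks). */
#define FL_CF 0x0001
#define FL_PF 0x0004
#define FL_ZF 0x0040
#define FL_SF 0x0080
#define FL_OF 0x0800

/*
 * Even opcodes (0x70, 0x72, ...) jump when the condition holds; the
 * following odd opcode tests the inverted condition, exactly as the
 * COND(op_y, op_n, expr) pairs spell out.
 */
static bool jcc_taken(uint8_t opc, unsigned long flags)
{
	bool of = flags & FL_OF, cf = flags & FL_CF;
	bool zf = flags & FL_ZF, sf = flags & FL_SF;
	bool pf = flags & FL_PF;
	bool cond;

	switch (opc & ~1u) {			 /* fold jcc/jncc pairs */
	case 0x70: cond = of; break;		 /* jo  / jno */
	case 0x72: cond = cf; break;		 /* jb  / jae */
	case 0x74: cond = zf; break;		 /* je  / jne */
	case 0x76: cond = cf || zf; break;	 /* jbe / ja  */
	case 0x78: cond = sf; break;		 /* js  / jns */
	case 0x7a: cond = pf; break;		 /* jp  / jnp */
	case 0x7c: cond = sf != of; break;	 /* jl  / jge */
	case 0x7e: cond = zf || sf != of; break; /* jle / jg  */
	default:   return true;	/* not a Jcc: treat as always taken */
	}
	return (opc & 1) ? !cond : cond;
}
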
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 33e8c028842f..138ceffc6377 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7778,7 +7778,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7778 7778
7779 exec_control = vmcs12->pin_based_vm_exec_control; 7779 exec_control = vmcs12->pin_based_vm_exec_control;
7780 exec_control |= vmcs_config.pin_based_exec_ctrl; 7780 exec_control |= vmcs_config.pin_based_exec_ctrl;
7781 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 7781 exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
7782 PIN_BASED_POSTED_INTR);
7782 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 7783 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
7783 7784
7784 vmx->nested.preemption_timer_expired = false; 7785 vmx->nested.preemption_timer_expired = false;
@@ -7815,7 +7816,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7815 if (!vmx->rdtscp_enabled) 7816 if (!vmx->rdtscp_enabled)
7816 exec_control &= ~SECONDARY_EXEC_RDTSCP; 7817 exec_control &= ~SECONDARY_EXEC_RDTSCP;
7817 /* Take the following fields only from vmcs12 */ 7818 /* Take the following fields only from vmcs12 */
7818 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7819 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7820 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
7821 SECONDARY_EXEC_APIC_REGISTER_VIRT);
7819 if (nested_cpu_has(vmcs12, 7822 if (nested_cpu_has(vmcs12,
7820 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 7823 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
7821 exec_control |= vmcs12->secondary_vm_exec_control; 7824 exec_control |= vmcs12->secondary_vm_exec_control;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b6c0bacca9bd..20316c67b824 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -106,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
106static u32 tsc_tolerance_ppm = 250; 106static u32 tsc_tolerance_ppm = 250;
107module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 107module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
108 108
109static bool backwards_tsc_observed = false;
110
109#define KVM_NR_SHARED_MSRS 16 111#define KVM_NR_SHARED_MSRS 16
110 112
111struct kvm_shared_msrs_global { 113struct kvm_shared_msrs_global {
@@ -1486,7 +1488,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1486 &ka->master_kernel_ns, 1488 &ka->master_kernel_ns,
1487 &ka->master_cycle_now); 1489 &ka->master_cycle_now);
1488 1490
1489 ka->use_master_clock = host_tsc_clocksource & vcpus_matched; 1491 ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1492 && !backwards_tsc_observed;
1490 1493
1491 if (ka->use_master_clock) 1494 if (ka->use_master_clock)
1492 atomic_set(&kvm_guest_has_master_clock, 1); 1495 atomic_set(&kvm_guest_has_master_clock, 1);
@@ -6945,6 +6948,7 @@ int kvm_arch_hardware_enable(void *garbage)
6945 */ 6948 */
6946 if (backwards_tsc) { 6949 if (backwards_tsc) {
6947 u64 delta_cyc = max_tsc - local_tsc; 6950 u64 delta_cyc = max_tsc - local_tsc;
6951 backwards_tsc_observed = true;
6948 list_for_each_entry(kvm, &vm_list, vm_list) { 6952 list_for_each_entry(kvm, &vm_list, vm_list) {
6949 kvm_for_each_vcpu(i, vcpu, kvm) { 6953 kvm_for_each_vcpu(i, vcpu, kvm) {
6950 vcpu->arch.tsc_offset_adjustment += delta_cyc; 6954 vcpu->arch.tsc_offset_adjustment += delta_cyc;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index eabcb6e6a900..4d4f96a27638 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -16,7 +16,7 @@ clean-files := inat-tables.c
16 16
17obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o 17obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
18 18
19lib-y := delay.o misc.o 19lib-y := delay.o misc.o cmdline.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
21lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
22lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 000000000000..422db000d727
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,84 @@
1/*
2 * This file is part of the Linux kernel, and is made available under
3 * the terms of the GNU General Public License version 2.
4 *
5 * Misc librarized functions for cmdline poking.
6 */
7#include <linux/kernel.h>
8#include <linux/string.h>
9#include <linux/ctype.h>
10#include <asm/setup.h>
11
12static inline int myisspace(u8 c)
13{
14 return c <= ' '; /* Close enough approximation */
15}
16
17/**
 18 * Find a boolean option (like quiet, noapic, nosmp, ...)
19 *
20 * @cmdline: the cmdline string
21 * @option: option string to look for
22 *
 23 * Returns the position of that @option (counting from 1)
 24 * or 0 if not found.
25 */
26int cmdline_find_option_bool(const char *cmdline, const char *option)
27{
28 char c;
29 int len, pos = 0, wstart = 0;
30 const char *opptr = NULL;
31 enum {
32 st_wordstart = 0, /* Start of word/after whitespace */
33 st_wordcmp, /* Comparing this word */
34 st_wordskip, /* Miscompare, skip */
35 } state = st_wordstart;
36
37 if (!cmdline)
38 return -1; /* No command line */
39
40 len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
41 if (!len)
42 return 0;
43
44 while (len--) {
45 c = *(char *)cmdline++;
46 pos++;
47
48 switch (state) {
49 case st_wordstart:
50 if (!c)
51 return 0;
52 else if (myisspace(c))
53 break;
54
55 state = st_wordcmp;
56 opptr = option;
57 wstart = pos;
58 /* fall through */
59
60 case st_wordcmp:
61 if (!*opptr)
62 if (!c || myisspace(c))
63 return wstart;
64 else
65 state = st_wordskip;
66 else if (!c)
67 return 0;
68 else if (c != *opptr++)
69 state = st_wordskip;
70 else if (!len) /* last word and is matching */
71 return wstart;
72 break;
73
74 case st_wordskip:
75 if (!c)
76 return 0;
77 else if (myisspace(c))
78 state = st_wordstart;
79 break;
80 }
81 }
82
83 return 0; /* Buffer overrun */
84}
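
A usage sketch for the new helper (illustrative only: the caller and option name are invented; the return convention of a 1-based position, 0 when absent, and -1 for a NULL cmdline is the one documented above):

#include <linux/init.h>		/* __init, boot_command_line */
#include <linux/types.h>
#include <asm/cmdline.h>	/* cmdline_find_option_bool() */

/* Hypothetical caller: true if "noexample" appears on the cmdline. */
static bool __init example_option_present(void)
{
	/*
	 * cmdline_find_option_bool() returns the 1-based position of
	 * the matched word, 0 when the option is absent, and -1 when
	 * the command line pointer is NULL, so test for "> 0".
	 */
	return cmdline_find_option_bool(boot_command_line, "noexample") > 0;
}
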
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 597ac155c91c..bc7527e109c8 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
50 return err; 50 return err;
51} 51}
52 52
53static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
54 void *arg)
55{
56 unsigned long i;
57
58 for (i = 0; i < nr_pages; ++i)
59 if (pfn_valid(start_pfn + i) &&
60 !PageReserved(pfn_to_page(start_pfn + i)))
61 return 1;
62
63 WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
64
65 return 0;
66}
67
53/* 68/*
54 * Remap an arbitrary physical address space into the kernel virtual 69 * Remap an arbitrary physical address space into the kernel virtual
55 * address space. Needed when the kernel wants to access high addresses 70 * address space. Needed when the kernel wants to access high addresses
@@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
93 /* 108 /*
94 * Don't allow anybody to remap normal RAM that we're using.. 109 * Don't allow anybody to remap normal RAM that we're using..
95 */ 110 */
111 pfn = phys_addr >> PAGE_SHIFT;
96 last_pfn = last_addr >> PAGE_SHIFT; 112 last_pfn = last_addr >> PAGE_SHIFT;
97 for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { 113 if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
98 int is_ram = page_is_ram(pfn); 114 __ioremap_check_ram) == 1)
99 115 return NULL;
100 if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
101 return NULL;
102 WARN_ON_ONCE(is_ram);
103 }
104 116
105 /* 117 /*
106 * Mappings have to be page-aligned 118 * Mappings have to be page-aligned
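
The hunk above leans on the walk_system_ram_range() contract: the callback is invoked once per System RAM extent (expressed as a pfn range) and the walk stops early on a non-zero return. A hedged sketch of an equivalent caller-side check (the wrapper name is invented):

#include <linux/ioport.h>	/* walk_system_ram_range() */
#include <linux/mm.h>

static bool overlaps_usable_ram(resource_size_t phys_addr, unsigned long size)
{
	unsigned long pfn = phys_addr >> PAGE_SHIFT;
	unsigned long last_pfn = (phys_addr + size - 1) >> PAGE_SHIFT;

	/*
	 * __ioremap_check_ram() returns 1 on the first extent that
	 * contains usable (valid, non-reserved) RAM, which terminates
	 * the walk and is propagated back here.
	 */
	return walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
				     __ioremap_check_ram) == 1;
}
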
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index c96314abd144..0004ac72dbdd 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -399,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
399int ptep_clear_flush_young(struct vm_area_struct *vma, 399int ptep_clear_flush_young(struct vm_area_struct *vma,
400 unsigned long address, pte_t *ptep) 400 unsigned long address, pte_t *ptep)
401{ 401{
402 int young; 402 /*
403 403 * On x86 CPUs, clearing the accessed bit without a TLB flush
404 young = ptep_test_and_clear_young(vma, address, ptep); 404 * doesn't cause data corruption. [ It could cause incorrect
405 if (young) 405 * page aging and the (mistaken) reclaim of hot pages, but the
406 flush_tlb_page(vma, address); 406 * chance of that should be relatively low. ]
407 407 *
408 return young; 408 * So as a performance optimization don't flush the TLB when
409 * clearing the accessed bit, it will eventually be flushed by
410 * a context switch or a VM operation anyway. [ In the rare
411 * event of it not getting flushed for a long time the delay
412 * shouldn't really matter because there's no real memory
413 * pressure for swapout to react to. ]
414 */
415 return ptep_test_and_clear_young(vma, address, ptep);
409} 416}
410 417
411#ifdef CONFIG_TRANSPARENT_HUGEPAGE 418#ifdef CONFIG_TRANSPARENT_HUGEPAGE
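
For contrast with the new x86 behaviour, this is roughly the flush-on-young variant the hunk deletes, reproduced as a standalone sketch (the function name is illustrative):

#include <linux/mm.h>
#include <asm/pgtable.h>	/* ptep_test_and_clear_young() */
#include <asm/tlbflush.h>	/* flush_tlb_page() */

static int ptep_clear_flush_young_flushing(struct vm_area_struct *vma,
					   unsigned long address, pte_t *ptep)
{
	int young = ptep_test_and_clear_young(vma, address, ptep);

	/* Pay for a TLB flush only when the bit was actually set. */
	if (young)
		flush_tlb_page(vma, address);

	return young;
}
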
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index dc017735bb91..6d5663a599a7 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -171,7 +171,7 @@ static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
171 memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ 171 memset(header, 0xcc, sz); /* fill whole space with int3 instructions */
172 172
173 header->pages = sz / PAGE_SIZE; 173 header->pages = sz / PAGE_SIZE;
174 hole = sz - (proglen + sizeof(*header)); 174 hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header));
175 175
176 /* insert a random number of int3 instructions before BPF code */ 176 /* insert a random number of int3 instructions before BPF code */
177 *image_ptr = &header->image[prandom_u32() % hole]; 177 *image_ptr = &header->image[prandom_u32() % hole];
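
A worked example of what the min() clamp changes, assuming 4 KiB pages, a 32-byte header, and the 128-byte pad this allocator adds before rounding up to a page (all three sizes are assumptions for illustration):

#include <stdio.h>

#define PAGE_SZ 4096UL
#define ROUND_UP(x, a) ((((x) + (a) - 1) / (a)) * (a))

int main(void)
{
	unsigned long hdr = 32;		/* assumed sizeof(*header)      */
	unsigned long proglen = 4000;	/* JIT image close to page size */
	unsigned long sz = ROUND_UP(proglen + hdr + 128, PAGE_SZ); /* 8192 */

	unsigned long hole_old = sz - (proglen + hdr);	/* 4160 */
	unsigned long cap = PAGE_SZ - hdr;		/* 4064 */
	unsigned long hole_new = hole_old < cap ? hole_old : cap;

	/*
	 * Old: a random pad of up to 4159 bytes could push the program
	 * start past the first page of the allocation. New: the start
	 * always lands in the first page.
	 */
	printf("sz=%lu hole_old=%lu hole_new=%lu\n", sz, hole_old, hole_new);
	return 0;
}
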
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 01edac6c5e18..5075371ab593 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -489,8 +489,12 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
489 } 489 }
490 490
491 node = acpi_get_node(device->handle); 491 node = acpi_get_node(device->handle);
492 if (node == NUMA_NO_NODE) 492 if (node == NUMA_NO_NODE) {
493 node = x86_pci_root_bus_node(busnum); 493 node = x86_pci_root_bus_node(busnum);
494 if (node != 0 && node != NUMA_NO_NODE)
495 dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
496 node);
497 }
494 498
495 if (node != NUMA_NO_NODE && !node_online(node)) 499 if (node != NUMA_NO_NODE && !node_online(node))
496 node = NUMA_NO_NODE; 500 node = NUMA_NO_NODE;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e88f4c53d7f6..c20d2cc7ef64 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -11,27 +11,33 @@
11 11
12#include "bus_numa.h" 12#include "bus_numa.h"
13 13
14/* 14#define AMD_NB_F0_NODE_ID 0x60
15 * This discovers the pcibus <-> node mapping on AMD K8. 15#define AMD_NB_F0_UNIT_ID 0x64
16 * also get peer root bus resource for io,mmio 16#define AMD_NB_F1_CONFIG_MAP_REG 0xe0
17 */ 17
18#define RANGE_NUM 16
19#define AMD_NB_F1_CONFIG_MAP_RANGES 4
18 20
19struct pci_hostbridge_probe { 21struct amd_hostbridge {
20 u32 bus; 22 u32 bus;
21 u32 slot; 23 u32 slot;
22 u32 vendor;
23 u32 device; 24 u32 device;
24}; 25};
25 26
26static struct pci_hostbridge_probe pci_probes[] __initdata = { 27/*
27 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, 28 * IMPORTANT NOTE:
28 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, 29 * hb_probes[] and early_root_info_init() is in maintenance mode.
29 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 30 * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh .
 28 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, 29 * hb_probes[] and early_root_info_init() are in maintenance mode.
 29 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 30 * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh.
 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, 31 * Future processors will rely on information in ACPI.
34 { 0, 0x18, 0x1100 }, /* K8 */
35 { 0, 0x18, 0x1200 }, /* Family10h */
36 { 0xff, 0, 0x1200 }, /* Family10h */
37 { 0, 0x18, 0x1300 }, /* Family11h */
38 { 0, 0x18, 0x1600 }, /* Family15h */
31}; 39};
32 40
33#define RANGE_NUM 16
34
35static struct pci_root_info __init *find_pci_root_info(int node, int link) 41static struct pci_root_info __init *find_pci_root_info(int node, int link)
36{ 42{
37 struct pci_root_info *info; 43 struct pci_root_info *info;
@@ -45,12 +51,12 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
45} 51}
46 52
47/** 53/**
48 * early_fill_mp_bus_to_node() 54 * early_root_info_init()
49 * called before pcibios_scan_root and pci_scan_bus 55 * called before pcibios_scan_root and pci_scan_bus
 50 * fills the mp_bus_to_cpumask array based according to the LDT Bus Number 56 * fills the mp_bus_to_cpumask array according
51 * Registers found in the K8 northbridge 57 * to the LDT Bus Number Registers found in the northbridge.
52 */ 58 */
53static int __init early_fill_mp_bus_info(void) 59static int __init early_root_info_init(void)
54{ 60{
55 int i; 61 int i;
56 unsigned bus; 62 unsigned bus;
@@ -75,19 +81,21 @@ static int __init early_fill_mp_bus_info(void)
75 return -1; 81 return -1;
76 82
77 found = false; 83 found = false;
78 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 84 for (i = 0; i < ARRAY_SIZE(hb_probes); i++) {
79 u32 id; 85 u32 id;
80 u16 device; 86 u16 device;
81 u16 vendor; 87 u16 vendor;
82 88
83 bus = pci_probes[i].bus; 89 bus = hb_probes[i].bus;
84 slot = pci_probes[i].slot; 90 slot = hb_probes[i].slot;
85 id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); 91 id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
86
87 vendor = id & 0xffff; 92 vendor = id & 0xffff;
88 device = (id>>16) & 0xffff; 93 device = (id>>16) & 0xffff;
89 if (pci_probes[i].vendor == vendor && 94
90 pci_probes[i].device == device) { 95 if (vendor != PCI_VENDOR_ID_AMD)
96 continue;
97
98 if (hb_probes[i].device == device) {
91 found = true; 99 found = true;
92 break; 100 break;
93 } 101 }
@@ -96,10 +104,16 @@ static int __init early_fill_mp_bus_info(void)
96 if (!found) 104 if (!found)
97 return 0; 105 return 0;
98 106
99 for (i = 0; i < 4; i++) { 107 /*
108 * We should learn topology and routing information from _PXM and
109 * _CRS methods in the ACPI namespace. We extract node numbers
110 * here to work around BIOSes that don't supply _PXM.
111 */
112 for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) {
100 int min_bus; 113 int min_bus;
101 int max_bus; 114 int max_bus;
102 reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); 115 reg = read_pci_config(bus, slot, 1,
116 AMD_NB_F1_CONFIG_MAP_REG + (i << 2));
103 117
104 /* Check if that register is enabled for bus range */ 118 /* Check if that register is enabled for bus range */
105 if ((reg & 7) != 3) 119 if ((reg & 7) != 3)
@@ -113,10 +127,21 @@ static int __init early_fill_mp_bus_info(void)
113 info = alloc_pci_root_info(min_bus, max_bus, node, link); 127 info = alloc_pci_root_info(min_bus, max_bus, node, link);
114 } 128 }
115 129
130 /*
131 * The following code extracts routing information for use on old
132 * systems where Linux doesn't automatically use host bridge _CRS
133 * methods (or when the user specifies "pci=nocrs").
134 *
135 * We only do this through Fam11h, because _CRS should be enough on
136 * newer systems.
137 */
138 if (boot_cpu_data.x86 > 0x11)
139 return 0;
140
116 /* get the default node and link for left over res */ 141 /* get the default node and link for left over res */
117 reg = read_pci_config(bus, slot, 0, 0x60); 142 reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID);
118 def_node = (reg >> 8) & 0x07; 143 def_node = (reg >> 8) & 0x07;
119 reg = read_pci_config(bus, slot, 0, 0x64); 144 reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID);
120 def_link = (reg >> 8) & 0x03; 145 def_link = (reg >> 8) & 0x03;
121 146
122 memset(range, 0, sizeof(range)); 147 memset(range, 0, sizeof(range));
@@ -363,7 +388,7 @@ static int __init pci_io_ecs_init(void)
363 int cpu; 388 int cpu;
364 389
365 /* assume all cpus from fam10h have IO ECS */ 390 /* assume all cpus from fam10h have IO ECS */
366 if (boot_cpu_data.x86 < 0x10) 391 if (boot_cpu_data.x86 < 0x10)
367 return 0; 392 return 0;
368 393
369 /* Try the PCI method first. */ 394 /* Try the PCI method first. */
@@ -387,7 +412,7 @@ static int __init amd_postcore_init(void)
387 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 412 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
388 return 0; 413 return 0;
389 414
390 early_fill_mp_bus_info(); 415 early_root_info_init();
391 pci_io_ecs_init(); 416 pci_io_ecs_init();
392 417
393 return 0; 418 return 0;
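
A decode sketch for the F1 config-map registers read above. The enable test and bus-range fields match the reads visible in this hunk; the node/link bit positions follow the AMD northbridge register layout as best recalled here, so treat them as assumptions rather than a datasheet quote.

#include <stdbool.h>
#include <stdint.h>

struct nb_bus_range {
	bool enabled;		/* bits 1:0 == 3: read+write enabled   */
	unsigned int node;	/* destination node (assumed bits 6:4) */
	unsigned int link;	/* destination link (assumed bits 9:8) */
	unsigned int min_bus;	/* bits 23:16                          */
	unsigned int max_bus;	/* bits 31:24                          */
};

/* Decode one AMD_NB_F1_CONFIG_MAP_REG + (i << 2) register value. */
static struct nb_bus_range decode_config_map(uint32_t reg)
{
	struct nb_bus_range r = {
		.enabled = (reg & 7) == 3,
		.node	 = (reg >> 4) & 0x7,
		.link	 = (reg >> 8) & 0x3,
		.min_bus = (reg >> 16) & 0xff,
		.max_bus = (reg >> 24) & 0xff,
	};
	return r;
}
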
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 614392ced7d6..bb461cfd01ab 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -60,8 +60,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
60 word1 = read_pci_config_16(bus, slot, func, 0xc4); 60 word1 = read_pci_config_16(bus, slot, func, 0xc4);
61 word2 = read_pci_config_16(bus, slot, func, 0xc6); 61 word2 = read_pci_config_16(bus, slot, func, 0xc6);
62 if (word1 != word2) { 62 if (word1 != word2) {
63 res.start = (word1 << 16) | 0x0000; 63 res.start = ((resource_size_t) word1 << 16) | 0x0000;
64 res.end = (word2 << 16) | 0xffff; 64 res.end = ((resource_size_t) word2 << 16) | 0xffff;
65 res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH; 65 res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
66 update_res(info, res.start, res.end, res.flags, 0); 66 update_res(info, res.start, res.end, res.flags, 0);
67 } 67 }
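
The cast added above matters because of integer promotion: a u16 shifted left by 16 is computed as a 32-bit int, so a window base at or above 2 GB lands in the sign bit and then sign-extends when stored into a 64-bit resource_size_t. A standalone demonstration (values illustrative):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t resource_size_t;	/* as on 64-bit/PAE kernels */

int main(void)
{
	uint16_t word1 = 0x8000;	/* prefetch window base = 2 GB */

	/*
	 * word1 is promoted to int; 0x8000 << 16 hits the sign bit
	 * (formally undefined, in practice sign-extended on store).
	 */
	resource_size_t buggy = word1 << 16;
	resource_size_t fixed = (resource_size_t)word1 << 16;

	printf("buggy=0x%llx fixed=0x%llx\n",
	       (unsigned long long)buggy, (unsigned long long)fixed);
	/* prints buggy=0xffffffff80000000 fixed=0x80000000 */
	return 0;
}
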
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 94ae9ae9574f..b5e60268d93f 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,6 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/pci.h> 7#include <linux/pci.h>
8#include <linux/vgaarb.h> 8#include <linux/vgaarb.h>
9#include <asm/hpet.h>
9#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
10 11
11static void pci_fixup_i450nx(struct pci_dev *d) 12static void pci_fixup_i450nx(struct pci_dev *d)
@@ -337,9 +338,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
337 * type BRIDGE, or CARDBUS. Host to PCI controllers use 338 * type BRIDGE, or CARDBUS. Host to PCI controllers use
338 * PCI header type NORMAL. 339 * PCI header type NORMAL.
339 */ 340 */
340 if (bridge 341 if (bridge && (pci_is_bridge(bridge))) {
341 && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
342 || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
343 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, 342 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
344 &config); 343 &config);
345 if (!(config & PCI_BRIDGE_CTL_VGA)) 344 if (!(config & PCI_BRIDGE_CTL_VGA))
@@ -526,6 +525,19 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev)
526} 525}
527DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); 526DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
528 527
528#ifdef CONFIG_HPET_TIMER
529static void sb600_hpet_quirk(struct pci_dev *dev)
530{
531 struct resource *r = &dev->resource[1];
532
533 if (r->flags & IORESOURCE_MEM && r->start == hpet_address) {
534 r->flags |= IORESOURCE_PCI_FIXED;
535 dev_info(&dev->dev, "reg 0x14 contains HPET; making it immovable\n");
536 }
537}
538DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x4385, sb600_hpet_quirk);
539#endif
540
529/* 541/*
530 * Twinhead H12Y needs us to block out a region otherwise we map devices 542 * Twinhead H12Y needs us to block out a region otherwise we map devices
531 * there and any access kills the box. 543 * there and any access kills the box.
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index db6b1ab43255..a19ed92e74e4 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -271,11 +271,16 @@ static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass)
271 "BAR %d: reserving %pr (d=%d, p=%d)\n", 271 "BAR %d: reserving %pr (d=%d, p=%d)\n",
272 idx, r, disabled, pass); 272 idx, r, disabled, pass);
273 if (pci_claim_resource(dev, idx) < 0) { 273 if (pci_claim_resource(dev, idx) < 0) {
274 /* We'll assign a new address later */ 274 if (r->flags & IORESOURCE_PCI_FIXED) {
275 pcibios_save_fw_addr(dev, 275 dev_info(&dev->dev, "BAR %d %pR is immovable\n",
276 idx, r->start); 276 idx, r);
277 r->end -= r->start; 277 } else {
278 r->start = 0; 278 /* We'll assign a new address later */
279 pcibios_save_fw_addr(dev,
280 idx, r->start);
281 r->end -= r->start;
282 r->start = 0;
283 }
279 } 284 }
280 } 285 }
281 } 286 }
@@ -356,6 +361,12 @@ static int __init pcibios_assign_resources(void)
356 return 0; 361 return 0;
357} 362}
358 363
364/**
365 * called in fs_initcall (one below subsys_initcall),
 366 * to give the motherboard a chance to reserve resources
367 */
368fs_initcall(pcibios_assign_resources);
369
359void pcibios_resource_survey_bus(struct pci_bus *bus) 370void pcibios_resource_survey_bus(struct pci_bus *bus)
360{ 371{
361 dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n"); 372 dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n");
@@ -392,12 +403,6 @@ void __init pcibios_resource_survey(void)
392 ioapic_insert_resources(); 403 ioapic_insert_resources();
393} 404}
394 405
395/**
396 * called in fs_initcall (one below subsys_initcall),
397 * give a chance for motherboard reserve resources
398 */
399fs_initcall(pcibios_assign_resources);
400
401static const struct vm_operations_struct pci_mmap_ops = { 406static const struct vm_operations_struct pci_mmap_ops = {
402 .access = generic_access_phys, 407 .access = generic_access_phys,
403}; 408};
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 3497f14e4dea..7c0d7be176a5 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -52,8 +52,9 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
52OBJCOPYFLAGS_realmode.bin := -O binary 52OBJCOPYFLAGS_realmode.bin := -O binary
53 53
54targets += realmode.bin 54targets += realmode.bin
55$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs 55$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE
56 $(call if_changed,objcopy) 56 $(call if_changed,objcopy)
57 @:
57 58
58quiet_cmd_relocs = RELOCS $@ 59quiet_cmd_relocs = RELOCS $@
59 cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ 60 cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 00348980a3a6..e1f220e3ca68 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -39,6 +39,7 @@
39#ifdef CONFIG_X86_64 39#ifdef CONFIG_X86_64
40#define vdso_enabled sysctl_vsyscall32 40#define vdso_enabled sysctl_vsyscall32
41#define arch_setup_additional_pages syscall32_setup_pages 41#define arch_setup_additional_pages syscall32_setup_pages
42extern int sysctl_ldt16;
42#endif 43#endif
43 44
44/* 45/*
@@ -249,6 +250,13 @@ static struct ctl_table abi_table2[] = {
249 .mode = 0644, 250 .mode = 0644,
250 .proc_handler = proc_dointvec 251 .proc_handler = proc_dointvec
251 }, 252 },
253 {
254 .procname = "ldt16",
255 .data = &sysctl_ldt16,
256 .maxlen = sizeof(int),
257 .mode = 0644,
258 .proc_handler = proc_dointvec
259 },
252 {} 260 {}
253}; 261};
254 262
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c34bfc4bbe7f..f17b29210ac4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1339,6 +1339,7 @@ xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
1339 1339
1340static struct notifier_block xen_panic_block = { 1340static struct notifier_block xen_panic_block = {
1341 .notifier_call= xen_panic_event, 1341 .notifier_call= xen_panic_event,
1342 .priority = INT_MIN
1342}; 1343};
1343 1344
1344int xen_panic_handler_init(void) 1345int xen_panic_handler_init(void)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 86e02eabb640..6f6e15d28466 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2510,6 +2510,95 @@ void __init xen_hvm_init_mmu_ops(void)
2510} 2510}
2511#endif 2511#endif
2512 2512
2513#ifdef CONFIG_XEN_PVH
2514/*
 2515 * Map a foreign gfn (fgfn) to a local pfn (lpfn). This is for user
 2516 * space creating a new guest on PVH dom0 and needing to map domU pages.
2517 */
2518static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
2519 unsigned int domid)
2520{
2521 int rc, err = 0;
2522 xen_pfn_t gpfn = lpfn;
2523 xen_ulong_t idx = fgfn;
2524
2525 struct xen_add_to_physmap_range xatp = {
2526 .domid = DOMID_SELF,
2527 .foreign_domid = domid,
2528 .size = 1,
2529 .space = XENMAPSPACE_gmfn_foreign,
2530 };
2531 set_xen_guest_handle(xatp.idxs, &idx);
2532 set_xen_guest_handle(xatp.gpfns, &gpfn);
2533 set_xen_guest_handle(xatp.errs, &err);
2534
2535 rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
2536 if (rc < 0)
2537 return rc;
2538 return err;
2539}
2540
2541static int xlate_remove_from_p2m(unsigned long spfn, int count)
2542{
2543 struct xen_remove_from_physmap xrp;
 2544 int i, rc = 0;
2545
2546 for (i = 0; i < count; i++) {
2547 xrp.domid = DOMID_SELF;
2548 xrp.gpfn = spfn+i;
2549 rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
2550 if (rc)
2551 break;
2552 }
2553 return rc;
2554}
2555
2556struct xlate_remap_data {
2557 unsigned long fgfn; /* foreign domain's gfn */
2558 pgprot_t prot;
2559 domid_t domid;
2560 int index;
2561 struct page **pages;
2562};
2563
2564static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
2565 void *data)
2566{
2567 int rc;
2568 struct xlate_remap_data *remap = data;
2569 unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
2570 pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
2571
2572 rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
2573 if (rc)
2574 return rc;
2575 native_set_pte(ptep, pteval);
2576
2577 return 0;
2578}
2579
2580static int xlate_remap_gfn_range(struct vm_area_struct *vma,
2581 unsigned long addr, unsigned long mfn,
2582 int nr, pgprot_t prot, unsigned domid,
2583 struct page **pages)
2584{
2585 int err;
2586 struct xlate_remap_data pvhdata;
2587
2588 BUG_ON(!pages);
2589
2590 pvhdata.fgfn = mfn;
2591 pvhdata.prot = prot;
2592 pvhdata.domid = domid;
2593 pvhdata.index = 0;
2594 pvhdata.pages = pages;
2595 err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
2596 xlate_map_pte_fn, &pvhdata);
2597 flush_tlb_all();
2598 return err;
2599}
2600#endif
2601
2513#define REMAP_BATCH_SIZE 16 2602#define REMAP_BATCH_SIZE 16
2514 2603
2515struct remap_data { 2604struct remap_data {
@@ -2522,7 +2611,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2522 unsigned long addr, void *data) 2611 unsigned long addr, void *data)
2523{ 2612{
2524 struct remap_data *rmd = data; 2613 struct remap_data *rmd = data;
2525 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); 2614 pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
2526 2615
2527 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; 2616 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2528 rmd->mmu_update->val = pte_val_ma(pte); 2617 rmd->mmu_update->val = pte_val_ma(pte);
@@ -2544,13 +2633,18 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2544 unsigned long range; 2633 unsigned long range;
2545 int err = 0; 2634 int err = 0;
2546 2635
2547 if (xen_feature(XENFEAT_auto_translated_physmap))
2548 return -EINVAL;
2549
2550 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2551
2552 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); 2636 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2553 2637
2638 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2639#ifdef CONFIG_XEN_PVH
2640 /* We need to update the local page tables and the xen HAP */
2641 return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
2642 domid, pages);
2643#else
2644 return -EINVAL;
2645#endif
2646 }
2647
2554 rmd.mfn = mfn; 2648 rmd.mfn = mfn;
2555 rmd.prot = prot; 2649 rmd.prot = prot;
2556 2650
@@ -2588,6 +2682,25 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2588 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) 2682 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
2589 return 0; 2683 return 0;
2590 2684
2685#ifdef CONFIG_XEN_PVH
2686 while (numpgs--) {
2687 /*
2688 * The mmu has already cleaned up the process mmu
2689 * resources at this point (lookup_address will return
2690 * NULL).
2691 */
2692 unsigned long pfn = page_to_pfn(pages[numpgs]);
2693
2694 xlate_remove_from_p2m(pfn, 1);
2695 }
2696 /*
 2697 * We don't need to flush TLBs because, as part of
 2698 * xlate_remove_from_p2m(), the hypervisor will do TLB flushes
 2699 * after removing the p2m entries from the EPT/NPT.
2700 */
2701 return 0;
2702#else
2591 return -EINVAL; 2703 return -EINVAL;
2704#endif
2592} 2705}
2593EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); 2706EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
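
A hedged caller-side sketch of the remap entry point as extended above (the wrapper is invented; on PV the call goes through the mmu_update path, on PVH through xlate_remap_gfn_range(), and the pages array is only consumed on the auto-translated side):

#include <linux/mm.h>
#include <xen/xen-ops.h>	/* xen_remap_domain_mfn_range() */

static int map_foreign_frames(struct vm_area_struct *vma, unsigned long addr,
			      unsigned long fgfn, int nr, unsigned int domid,
			      struct page **pages)
{
	/* VM_PFNMAP | VM_IO must already be set on @vma (BUG_ON above). */
	return xen_remap_domain_mfn_range(vma, addr, fgfn, nr,
					  vma->vm_page_prot, domid, pages);
}
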
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 85e5d78c9874..9bb3d82ffec8 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -36,7 +36,7 @@
36 * pfn_to_mfn(0xc0000)=0xc0000 36 * pfn_to_mfn(0xc0000)=0xc0000
37 * 37 *
 38 * The benefit of this is that we can assume for non-RAM regions (think 38 * The benefit of this is that we can assume for non-RAM regions (think
39 * PCI BARs, or ACPI spaces), we can create mappings easily b/c we 39 * PCI BARs, or ACPI spaces), we can create mappings easily because we
40 * get the PFN value to match the MFN. 40 * get the PFN value to match the MFN.
41 * 41 *
42 * For this to work efficiently we have one new page p2m_identity and 42 * For this to work efficiently we have one new page p2m_identity and
@@ -60,7 +60,7 @@
 60 * There is also a diagram of the P2M at the end that can help. 60 * There is also a diagram of the P2M at the end that can help.
61 * Imagine your E820 looking as so: 61 * Imagine your E820 looking as so:
62 * 62 *
63 * 1GB 2GB 63 * 1GB 2GB 4GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\ 64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | 65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/ 66 * \-------------------+---------/\----/ \----------/ \---+-----/
@@ -77,9 +77,8 @@
77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step 77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page 78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not 79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn 80 * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
81 * to end pfn. We reserve_brk top leaf pages if they are missing (means they 81 * required to split any existing p2m_mid_missing middle pages.
82 * point to p2m_mid_missing).
83 * 82 *
84 * With the E820 example above, 263424 is not 1GB aligned so we allocate a 83 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
85 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. 84 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
@@ -88,7 +87,7 @@
88 * Next stage is to determine if we need to do a more granular boundary check 87 * Next stage is to determine if we need to do a more granular boundary check
89 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. 88 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
90 * We check if the start pfn and end pfn violate that boundary check, and if 89 * We check if the start pfn and end pfn violate that boundary check, and if
91 * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer 90 * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
92 * granularity of setting which PFNs are missing and which ones are identity. 91 * granularity of setting which PFNs are missing and which ones are identity.
93 * In our example 263424 and 512256 both fail the check so we reserve_brk two 92 * In our example 263424 and 512256 both fail the check so we reserve_brk two
94 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" 93 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
@@ -102,9 +101,10 @@
102 * 101 *
103 * The next step is to walk from the start pfn to the end pfn setting 102 * The next step is to walk from the start pfn to the end pfn setting
104 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. 103 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
105 * If we find that the middle leaf is pointing to p2m_missing we can swap it 104 * If we find that the middle entry is pointing to p2m_missing we can swap it
106 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this 105 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
 107 * point we do not need to worry about boundary alignment (so no need to 106 * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
 107 * At this point we do not need to worry about boundary alignment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which 108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the 109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference 110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
@@ -118,6 +118,9 @@
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] 118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing." 119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 * 120 *
 121 * Finally, the region beyond the end of the E820 (4 GB in this example)
122 * is set to be identity (in case there are MMIO regions placed here).
123 *
121 * This is what the p2m ends up looking (for the E820 above) with this 124 * This is what the p2m ends up looking (for the E820 above) with this
122 * fabulous drawing: 125 * fabulous drawing:
123 * 126 *
@@ -129,21 +132,27 @@
129 * |-----| \ | [p2m_identity]+\\ | .... | 132 * |-----| \ | [p2m_identity]+\\ | .... |
130 * | 2 |--\ \-------------------->| ... | \\ \----------------/ 133 * | 2 |--\ \-------------------->| ... | \\ \----------------/
131 * |-----| \ \---------------/ \\ 134 * |-----| \ \---------------/ \\
132 * | 3 |\ \ \\ p2m_identity 135 * | 3 |-\ \ \\ p2m_identity [1]
133 * |-----| \ \-------------------->/---------------\ /-----------------\ 136 * |-----| \ \-------------------->/---------------\ /-----------------\
134 * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | 137 * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
135 * \-----/ / | [p2m_identity]+-->| ..., ~0 | 138 * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
136 * / /---------------\ | .... | \-----------------/ 139 * | | | .... | \-----------------/
137 * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | 140 * | | +-[x], ~0, ~0.. +\
138 * / | IDENTITY[@256]|<----/ \---------------/ 141 * | | \---------------/ \
139 * / | ~0, ~0, .... | 142 * | | \-> /---------------\
140 * | \---------------/ 143 * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
141 * | 144 * | /-----------------\ /------------\ | IDENTITY[@256]|
142 * p2m_mid_missing p2m_missing 145 * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
143 * /-----------------\ /------------\ 146 * | | [p2m_missing] +---->| ..., ~0 | \---------------/
144 * | [p2m_missing] +---->| ~0, ~0, ~0 | 147 * | | ... | \------------/
145 * | [p2m_missing] +---->| ..., ~0 | 148 * | \-----------------/
146 * \-----------------/ \------------/ 149 * |
150 * | p2m_mid_identity
151 * | /-----------------\
152 * \-->| [p2m_identity] +---->[1]
153 * | [p2m_identity] +---->[1]
154 * | ... |
155 * \-----------------/
147 * 156 *
148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) 157 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
149 */ 158 */
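
To make the top/mid/leaf indexing in the commentary concrete, a sketch of the three-level walk that get_phys_to_machine() performs; the 512-entry level size assumes 64-bit (a 4 KiB page holds 512 longs), which is also why one top-level entry covers 512 * 512 pfns, i.e. 1 GB of 4 KiB pages:

#define P2M_PER_PAGE		512	/* leaf entries (64-bit assumed) */
#define P2M_MID_PER_PAGE	512	/* middle entries                */

static unsigned long p2m_lookup(unsigned long ***p2m_top, unsigned long pfn)
{
	unsigned int topidx = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
	unsigned int mididx = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
	unsigned int idx    = pfn % P2M_PER_PAGE;

	/*
	 * Missing regions resolve through p2m_mid_missing/p2m_missing and
	 * identity regions through p2m_mid_identity/p2m_identity, so the
	 * walk never needs to special-case them.
	 */
	return p2m_top[topidx][mididx][idx];
}
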
@@ -187,13 +196,15 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
187static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); 196static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
188 197
189static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); 198static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
199static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
200static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
190 201
191RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 202RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
192RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); 203RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
193 204
194/* We might hit two boundary violations at the start and end, at max each 205/* We might hit two boundary violations at the start and end, at max each
195 * boundary violation will require three middle nodes. */ 206 * boundary violation will require three middle nodes. */
196RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); 207RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
197 208
198/* When we populate back during bootup, the amount of pages can vary. The 209/* When we populate back during bootup, the amount of pages can vary. The
199 * max we have is seen is 395979, but that does not mean it can't be more. 210 * max we have is seen is 395979, but that does not mean it can't be more.
@@ -242,20 +253,20 @@ static void p2m_top_mfn_p_init(unsigned long **top)
242 top[i] = p2m_mid_missing_mfn; 253 top[i] = p2m_mid_missing_mfn;
243} 254}
244 255
245static void p2m_mid_init(unsigned long **mid) 256static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
246{ 257{
247 unsigned i; 258 unsigned i;
248 259
249 for (i = 0; i < P2M_MID_PER_PAGE; i++) 260 for (i = 0; i < P2M_MID_PER_PAGE; i++)
250 mid[i] = p2m_missing; 261 mid[i] = leaf;
251} 262}
252 263
253static void p2m_mid_mfn_init(unsigned long *mid) 264static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
254{ 265{
255 unsigned i; 266 unsigned i;
256 267
257 for (i = 0; i < P2M_MID_PER_PAGE; i++) 268 for (i = 0; i < P2M_MID_PER_PAGE; i++)
258 mid[i] = virt_to_mfn(p2m_missing); 269 mid[i] = virt_to_mfn(leaf);
259} 270}
260 271
261static void p2m_init(unsigned long *p2m) 272static void p2m_init(unsigned long *p2m)
@@ -286,7 +297,9 @@ void __ref xen_build_mfn_list_list(void)
286 /* Pre-initialize p2m_top_mfn to be completely missing */ 297 /* Pre-initialize p2m_top_mfn to be completely missing */
287 if (p2m_top_mfn == NULL) { 298 if (p2m_top_mfn == NULL) {
288 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); 299 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
289 p2m_mid_mfn_init(p2m_mid_missing_mfn); 300 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
301 p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
302 p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
290 303
291 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); 304 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
292 p2m_top_mfn_p_init(p2m_top_mfn_p); 305 p2m_top_mfn_p_init(p2m_top_mfn_p);
@@ -295,7 +308,8 @@ void __ref xen_build_mfn_list_list(void)
295 p2m_top_mfn_init(p2m_top_mfn); 308 p2m_top_mfn_init(p2m_top_mfn);
296 } else { 309 } else {
297 /* Reinitialise, mfn's all change after migration */ 310 /* Reinitialise, mfn's all change after migration */
298 p2m_mid_mfn_init(p2m_mid_missing_mfn); 311 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
312 p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
299 } 313 }
300 314
301 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { 315 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
@@ -327,7 +341,7 @@ void __ref xen_build_mfn_list_list(void)
327 * it too late. 341 * it too late.
328 */ 342 */
329 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); 343 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
330 p2m_mid_mfn_init(mid_mfn_p); 344 p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
331 345
332 p2m_top_mfn_p[topidx] = mid_mfn_p; 346 p2m_top_mfn_p[topidx] = mid_mfn_p;
333 } 347 }
@@ -365,16 +379,17 @@ void __init xen_build_dynamic_phys_to_machine(void)
365 379
366 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 380 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
367 p2m_init(p2m_missing); 381 p2m_init(p2m_missing);
382 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
383 p2m_init(p2m_identity);
368 384
369 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 385 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
370 p2m_mid_init(p2m_mid_missing); 386 p2m_mid_init(p2m_mid_missing, p2m_missing);
387 p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
388 p2m_mid_init(p2m_mid_identity, p2m_identity);
371 389
372 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); 390 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
373 p2m_top_init(p2m_top); 391 p2m_top_init(p2m_top);
374 392
375 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
376 p2m_init(p2m_identity);
377
378 /* 393 /*
379 * The domain builder gives us a pre-constructed p2m array in 394 * The domain builder gives us a pre-constructed p2m array in
380 * mfn_list for all the pages initially given to us, so we just 395 * mfn_list for all the pages initially given to us, so we just
@@ -386,7 +401,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
386 401
387 if (p2m_top[topidx] == p2m_mid_missing) { 402 if (p2m_top[topidx] == p2m_mid_missing) {
388 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); 403 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
389 p2m_mid_init(mid); 404 p2m_mid_init(mid, p2m_missing);
390 405
391 p2m_top[topidx] = mid; 406 p2m_top[topidx] = mid;
392 } 407 }
@@ -492,7 +507,7 @@ unsigned long get_phys_to_machine(unsigned long pfn)
492 unsigned topidx, mididx, idx; 507 unsigned topidx, mididx, idx;
493 508
494 if (unlikely(pfn >= MAX_P2M_PFN)) 509 if (unlikely(pfn >= MAX_P2M_PFN))
495 return INVALID_P2M_ENTRY; 510 return IDENTITY_FRAME(pfn);
496 511
497 topidx = p2m_top_index(pfn); 512 topidx = p2m_top_index(pfn);
498 mididx = p2m_mid_index(pfn); 513 mididx = p2m_mid_index(pfn);
@@ -545,7 +560,7 @@ static bool alloc_p2m(unsigned long pfn)
545 if (!mid) 560 if (!mid)
546 return false; 561 return false;
547 562
548 p2m_mid_init(mid); 563 p2m_mid_init(mid, p2m_missing);
549 564
550 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) 565 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
551 free_p2m_page(mid); 566 free_p2m_page(mid);
@@ -565,7 +580,7 @@ static bool alloc_p2m(unsigned long pfn)
565 if (!mid_mfn) 580 if (!mid_mfn)
566 return false; 581 return false;
567 582
568 p2m_mid_mfn_init(mid_mfn); 583 p2m_mid_mfn_init(mid_mfn, p2m_missing);
569 584
570 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); 585 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
571 mid_mfn_mfn = virt_to_mfn(mid_mfn); 586 mid_mfn_mfn = virt_to_mfn(mid_mfn);
@@ -596,7 +611,7 @@ static bool alloc_p2m(unsigned long pfn)
596 return true; 611 return true;
597} 612}
598 613
599static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) 614static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
600{ 615{
601 unsigned topidx, mididx, idx; 616 unsigned topidx, mididx, idx;
602 unsigned long *p2m; 617 unsigned long *p2m;
@@ -638,7 +653,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary
638 return true; 653 return true;
639} 654}
640 655
641static bool __init early_alloc_p2m(unsigned long pfn) 656static bool __init early_alloc_p2m_middle(unsigned long pfn)
642{ 657{
643 unsigned topidx = p2m_top_index(pfn); 658 unsigned topidx = p2m_top_index(pfn);
644 unsigned long *mid_mfn_p; 659 unsigned long *mid_mfn_p;
@@ -649,7 +664,7 @@ static bool __init early_alloc_p2m(unsigned long pfn)
649 if (mid == p2m_mid_missing) { 664 if (mid == p2m_mid_missing) {
650 mid = extend_brk(PAGE_SIZE, PAGE_SIZE); 665 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
651 666
652 p2m_mid_init(mid); 667 p2m_mid_init(mid, p2m_missing);
653 668
654 p2m_top[topidx] = mid; 669 p2m_top[topidx] = mid;
655 670
@@ -658,12 +673,12 @@ static bool __init early_alloc_p2m(unsigned long pfn)
 658 /* And the save/restore P2M tables... */ 673 /* And the save/restore P2M tables... */
659 if (mid_mfn_p == p2m_mid_missing_mfn) { 674 if (mid_mfn_p == p2m_mid_missing_mfn) {
660 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); 675 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
661 p2m_mid_mfn_init(mid_mfn_p); 676 p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
662 677
663 p2m_top_mfn_p[topidx] = mid_mfn_p; 678 p2m_top_mfn_p[topidx] = mid_mfn_p;
664 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); 679 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
 665 /* Note: we don't set mid_mfn_p[mididx] here, 680 /* Note: we don't set mid_mfn_p[mididx] here,
666 * look in early_alloc_p2m_middle */ 681 * look in early_alloc_p2m() */
667 } 682 }
668 return true; 683 return true;
669} 684}
@@ -739,7 +754,7 @@ found:
739 754
740 /* This shouldn't happen */ 755 /* This shouldn't happen */
741 if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) 756 if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
742 early_alloc_p2m(set_pfn); 757 early_alloc_p2m_middle(set_pfn);
743 758
744 if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) 759 if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
745 return false; 760 return false;
@@ -754,13 +769,13 @@ found:
754bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) 769bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
755{ 770{
756 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 771 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
757 if (!early_alloc_p2m(pfn)) 772 if (!early_alloc_p2m_middle(pfn))
758 return false; 773 return false;
759 774
760 if (early_can_reuse_p2m_middle(pfn, mfn)) 775 if (early_can_reuse_p2m_middle(pfn, mfn))
761 return __set_phys_to_machine(pfn, mfn); 776 return __set_phys_to_machine(pfn, mfn);
762 777
 763 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK! */)) 778 if (!early_alloc_p2m(pfn, false /* boundary crossover OK! */))
764 return false; 779 return false;
765 780
766 if (!__set_phys_to_machine(pfn, mfn)) 781 if (!__set_phys_to_machine(pfn, mfn))
@@ -769,12 +784,30 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
769 784
770 return true; 785 return true;
771} 786}
787
788static void __init early_split_p2m(unsigned long pfn)
789{
790 unsigned long mididx, idx;
791
792 mididx = p2m_mid_index(pfn);
793 idx = p2m_index(pfn);
794
795 /*
 796 * Allocate new middle and leaf pages if this pfn lies part-way
 797 * through an existing middle or leaf page.
798 */
799 if (mididx || idx)
800 early_alloc_p2m_middle(pfn);
801 if (idx)
802 early_alloc_p2m(pfn, false);
803}
804
772unsigned long __init set_phys_range_identity(unsigned long pfn_s, 805unsigned long __init set_phys_range_identity(unsigned long pfn_s,
773 unsigned long pfn_e) 806 unsigned long pfn_e)
774{ 807{
775 unsigned long pfn; 808 unsigned long pfn;
776 809
777 if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) 810 if (unlikely(pfn_s >= MAX_P2M_PFN))
778 return 0; 811 return 0;
779 812
780 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) 813 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
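
The early_split_p2m() helper added above decides which levels need private pages purely from the pfn's mid and leaf indices: a non-zero index at either level means the pfn falls part-way through that span. A userspace sketch of the decision, with illustrative sizes:

```c
#include <stdio.h>

#define P2M_PER_PAGE		512	/* illustrative, not the real config */
#define P2M_MID_PER_PAGE	512

int main(void)
{
	unsigned long pfn = (3UL * P2M_MID_PER_PAGE + 7) * P2M_PER_PAGE + 42;
	unsigned long mididx = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
	unsigned long idx = pfn % P2M_PER_PAGE;

	/* pfn is inside a mid-level span: it needs its own mid page. */
	if (mididx || idx)
		printf("split middle at pfn %#lx\n", pfn);
	/* pfn is also inside a leaf span: it needs its own leaf page. */
	if (idx)
		printf("split leaf at pfn %#lx\n", pfn);
	return 0;
}
```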
@@ -783,19 +816,30 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
783 if (pfn_s > pfn_e) 816 if (pfn_s > pfn_e)
784 return 0; 817 return 0;
785 818
786 for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); 819 if (pfn_e > MAX_P2M_PFN)
787 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); 820 pfn_e = MAX_P2M_PFN;
788 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
789 {
790 WARN_ON(!early_alloc_p2m(pfn));
791 }
792 821
793 early_alloc_p2m_middle(pfn_s, true); 822 early_split_p2m(pfn_s);
794 early_alloc_p2m_middle(pfn_e, true); 823 early_split_p2m(pfn_e);
824
825 for (pfn = pfn_s; pfn < pfn_e;) {
826 unsigned topidx = p2m_top_index(pfn);
827 unsigned mididx = p2m_mid_index(pfn);
795 828
796 for (pfn = pfn_s; pfn < pfn_e; pfn++)
797 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) 829 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
798 break; 830 break;
831 pfn++;
832
833 /*
834 * If the PFN was set to a middle or leaf identity
835 * page the remainder must also be identity, so skip
836 * ahead to the next middle or leaf entry.
837 */
838 if (p2m_top[topidx] == p2m_mid_identity)
839 pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
840 else if (p2m_top[topidx][mididx] == p2m_identity)
841 pfn = ALIGN(pfn, P2M_PER_PAGE);
842 }
799 843
800 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), 844 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
801 "Identity mapping failed. We are %ld short of 1-1 mappings!\n", 845 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
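
The rewritten set_phys_range_identity() loop above no longer visits every pfn: once a top or mid entry is known to be wholly identity, it aligns pfn forward to the end of that span. A toy sketch of the skip, assuming ALIGN() rounds up to the next multiple as the kernel macro does:

```c
#include <stdio.h>

#define P2M_PER_PAGE	8			/* toy sizes */
#define MID_SPAN	(4 * P2M_PER_PAGE)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long pfn = 5, end = 100, iters = 0;

	while (pfn < end) {
		iters++;
		pfn++;
		/* Pretend the whole mid-level entry is identity: jump
		 * to its end instead of visiting every pfn in it. */
		pfn = ALIGN(pfn, MID_SPAN);
	}
	printf("reached pfn %lu in %lu iterations\n", pfn, iters);
	return 0;
}
```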
@@ -825,8 +869,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
825 869
 826 /* For sparse holes where the p2m leaf has real PFN along with 870 /* For sparse holes where the p2m leaf has real PFN along with
827 * PCI holes, stick in the PFN as the MFN value. 871 * PCI holes, stick in the PFN as the MFN value.
872 *
873 * set_phys_range_identity() will have allocated new middle
 874 * and leaf pages as required, so an existing p2m_mid_missing
 875 * or p2m_missing means that the whole range will be identity,
 876 * so these can be switched to p2m_mid_identity or p2m_identity.
828 */ 877 */
829 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { 878 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
879 if (p2m_top[topidx] == p2m_mid_identity)
880 return true;
881
882 if (p2m_top[topidx] == p2m_mid_missing) {
883 WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
884 p2m_mid_identity) != p2m_mid_missing);
885 return true;
886 }
887
830 if (p2m_top[topidx][mididx] == p2m_identity) 888 if (p2m_top[topidx][mididx] == p2m_identity)
831 return true; 889 return true;
832 890
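
The new fast path in __set_phys_to_machine() above promotes a whole missing mid-level page to the shared identity page with a single cmpxchg(), and WARNs if the swap unexpectedly fails. A userspace sketch using C11 atomics in place of the kernel's cmpxchg():

```c
#include <stdatomic.h>
#include <stdio.h>

static unsigned long mid_missing[4];	/* cf. p2m_mid_missing */
static unsigned long mid_identity[4];	/* cf. p2m_mid_identity */

static _Atomic(unsigned long *) top[4] = {
	mid_missing, mid_missing, mid_missing, mid_missing
};

int main(void)
{
	unsigned long *expected = mid_missing;

	/* Swap the shared "missing" mid page for the shared "identity"
	 * one; the patch expects to be the only updater here. */
	if (atomic_compare_exchange_strong(&top[1], &expected, mid_identity))
		puts("promoted slot 1 to the identity mid page");
	else
		puts("unexpected racer (the kernel WARNs on this)");
	printf("identity now? %d\n", atomic_load(&top[1]) == mid_identity);
	return 0;
}
```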
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 0982233b9b84..210426a26cc0 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -89,10 +89,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
89 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { 89 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
90 unsigned long mfn = pfn_to_mfn(pfn); 90 unsigned long mfn = pfn_to_mfn(pfn);
91 91
92 if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) 92 if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
93 continue; 93 continue;
94 WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", 94 WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
95 pfn, mfn); 95 pfn, mfn);
96 96
97 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 97 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
98 } 98 }
@@ -469,6 +469,15 @@ char * __init xen_memory_setup(void)
469 } 469 }
470 470
471 /* 471 /*
472 * Set the rest as identity mapped, in case PCI BARs are
473 * located here.
474 *
475 * PFNs above MAX_P2M_PFN are considered identity mapped as
476 * well.
477 */
478 set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
479
480 /*
472 * In domU, the ISA region is normal, usable memory, but we 481 * In domU, the ISA region is normal, usable memory, but we
473 * reserve ISA memory anyway because too many things poke 482 * reserve ISA memory anyway because too many things poke
474 * about in there. 483 * about in there.
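
The new call in xen_memory_setup() hands set_phys_range_identity() an open-ended range (end pfn ~0ul) and relies on the clamp added in the p2m.c hunk earlier. A sketch of that contract, with a made-up limit standing in for MAX_P2M_PFN:

```c
#include <stdio.h>

#define MAX_P2M_PFN	0x1000000UL	/* stand-in limit, not the real one */

static unsigned long identity_range(unsigned long pfn_s, unsigned long pfn_e)
{
	if (pfn_s >= MAX_P2M_PFN || pfn_s > pfn_e)
		return 0;
	if (pfn_e > MAX_P2M_PFN)	/* the clamp added by this patch */
		pfn_e = MAX_P2M_PFN;
	return pfn_e - pfn_s;		/* pfns that would be identity-mapped */
}

int main(void)
{
	/* Mirrors the new call site: the end pfn is simply ~0UL. */
	printf("would map %lu pfns\n", identity_range(0x800000UL, ~0UL));
	return 0;
}
```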
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226e..c4df9dbd63b7 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,8 +12,10 @@
12#include "xen-ops.h" 12#include "xen-ops.h"
13#include "mmu.h" 13#include "mmu.h"
14 14
15void xen_arch_pre_suspend(void) 15static void xen_pv_pre_suspend(void)
16{ 16{
17 xen_mm_pin_all();
18
17 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); 19 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
18 xen_start_info->console.domU.mfn = 20 xen_start_info->console.domU.mfn =
19 mfn_to_pfn(xen_start_info->console.domU.mfn); 21 mfn_to_pfn(xen_start_info->console.domU.mfn);
@@ -26,7 +28,7 @@ void xen_arch_pre_suspend(void)
26 BUG(); 28 BUG();
27} 29}
28 30
29void xen_arch_hvm_post_suspend(int suspend_cancelled) 31static void xen_hvm_post_suspend(int suspend_cancelled)
30{ 32{
31#ifdef CONFIG_XEN_PVHVM 33#ifdef CONFIG_XEN_PVHVM
32 int cpu; 34 int cpu;
@@ -41,7 +43,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
41#endif 43#endif
42} 44}
43 45
44void xen_arch_post_suspend(int suspend_cancelled) 46static void xen_pv_post_suspend(int suspend_cancelled)
45{ 47{
46 xen_build_mfn_list_list(); 48 xen_build_mfn_list_list();
47 49
@@ -60,6 +62,21 @@ void xen_arch_post_suspend(int suspend_cancelled)
60 xen_vcpu_restore(); 62 xen_vcpu_restore();
61 } 63 }
62 64
65 xen_mm_unpin_all();
66}
67
68void xen_arch_pre_suspend(void)
69{
70 if (xen_pv_domain())
71 xen_pv_pre_suspend();
72}
73
74void xen_arch_post_suspend(int cancelled)
75{
76 if (xen_pv_domain())
77 xen_pv_post_suspend(cancelled);
78 else
79 xen_hvm_post_suspend(cancelled);
63} 80}
64 81
65static void xen_vcpu_notify_restore(void *data) 82static void xen_vcpu_notify_restore(void *data)
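
The suspend.c rework above turns the exported xen_arch_*() hooks into thin dispatchers over static per-mode helpers. A compilable sketch of the pattern, with stub names standing in for xen_pv_domain() and the real helpers:

```c
#include <stdbool.h>
#include <stdio.h>

static bool pv_domain = true;	/* stand-in for xen_pv_domain() */

static void pv_post_suspend(int cancelled)
{
	printf("PV resume, cancelled=%d\n", cancelled);
}

static void hvm_post_suspend(int cancelled)
{
	printf("HVM resume, cancelled=%d\n", cancelled);
}

/* The exported hook just dispatches; the per-mode helpers stay
 * static, as in the reworked suspend.c. */
static void arch_post_suspend(int cancelled)
{
	if (pv_domain)
		pv_post_suspend(cancelled);
	else
		hvm_post_suspend(cancelled);
}

int main(void)
{
	arch_post_suspend(0);
	return 0;
}
```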
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1cb6f4c37300..c834d4b231f0 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -31,6 +31,8 @@ void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31void xen_reserve_top(void); 31void xen_reserve_top(void);
32extern unsigned long xen_max_p2m_pfn; 32extern unsigned long xen_max_p2m_pfn;
33 33
34void xen_mm_pin_all(void);
35void xen_mm_unpin_all(void);
34void xen_set_pat(u64); 36void xen_set_pat(u64);
35 37
36char * __init xen_memory_setup(void); 38char * __init xen_memory_setup(void);
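
The two declarations added to xen-ops.h pair up across the suspend path: the PV pre-suspend hook pins all page tables and the PV post-suspend hook unpins them, bracketing the actual suspend. A sketch of that ordering, with stand-in stubs:

```c
#include <stdio.h>

static void mm_pin_all(void)	{ puts("pin all page tables");   }
static void mm_unpin_all(void)	{ puts("unpin all page tables"); }
static int do_suspend(void)	{ puts("...suspended..."); return 0; }

int main(void)
{
	int cancelled;

	mm_pin_all();			/* cf. the PV pre-suspend hook */
	cancelled = do_suspend();
	mm_unpin_all();			/* cf. the PV post-suspend hook */
	printf("cancelled = %d\n", cancelled);
	return 0;
}
```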