Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                               |   2
-rw-r--r--  arch/x86/Makefile                              |   4
-rw-r--r--  arch/x86/boot/Makefile                         |   2
-rw-r--r--  arch/x86/include/asm/mce.h                     |   8
-rw-r--r--  arch/x86/include/asm/perf_event.h              |  11
-rw-r--r--  arch/x86/include/asm/spinlock.h                |   3
-rw-r--r--  arch/x86/kernel/acpi/sleep.c                   |   4
-rw-r--r--  arch/x86/kernel/acpi/sleep.h                   |   2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S               |   4
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S               |   4
-rw-r--r--  arch/x86/kernel/alternative.c                  |   4
-rw-r--r--  arch/x86/kernel/apic/io_apic.c                 |  14
-rw-r--r--  arch/x86/kernel/cpu/common.c                   |   2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c      |   7
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c               |  43
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c               |  89
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h               |  20
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd_ibs.c       |   4
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c         |  10
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c      |   7
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 253
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.h  |  48
-rw-r--r--  arch/x86/kernel/irq.c                          |   3
-rw-r--r--  arch/x86/kernel/kdebugfs.c                     |   6
-rw-r--r--  arch/x86/kernel/microcode_amd.c                |   7
-rw-r--r--  arch/x86/kvm/emulate.c                         |  30
-rw-r--r--  arch/x86/kvm/i8259.c                           |  17
-rw-r--r--  arch/x86/kvm/mmu.c                             |  13
-rw-r--r--  arch/x86/kvm/vmx.c                             |  20
-rw-r--r--  arch/x86/kvm/x86.c                             |   9
-rw-r--r--  arch/x86/mm/hugetlbpage.c                      |  21
-rw-r--r--  arch/x86/mm/pageattr.c                         |  10
-rw-r--r--  arch/x86/mm/srat.c                             |  15
-rw-r--r--  arch/x86/platform/efi/efi.c                    |  30
-rw-r--r--  arch/x86/realmode/rm/Makefile                  |   2
-rw-r--r--  arch/x86/syscalls/syscall_64.tbl               |   8
-rw-r--r--  arch/x86/xen/enlighten.c                       | 118
-rw-r--r--  arch/x86/xen/mmu.c                             |   2
-rw-r--r--  arch/x86/xen/p2m.c                             |  94
-rw-r--r--  arch/x86/xen/setup.c                           |   9
-rw-r--r--  arch/x86/xen/suspend.c                         |   2
-rw-r--r--  arch/x86/xen/xen-ops.h                         |   2
42 files changed, 611 insertions, 352 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ba2657c49217..8ec3a1aa4abd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1527,7 +1527,7 @@ config SECCOMP
 	  If unsure, say Y. Only embedded should say N here.
 
 config CC_STACKPROTECTOR
-	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
+	bool "Enable -fstack-protector buffer overflow detection"
 	---help---
 	  This option turns on the -fstack-protector GCC feature. This
 	  feature puts, at the beginning of functions, a canary value on
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b0c5276861ec..682e9c210baa 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -27,6 +27,10 @@ ifeq ($(CONFIG_X86_32),y)
 
         KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
 
+        # Never want PIC in a 32-bit kernel, prevent breakage with GCC built
+        # with nonstandard options
+        KBUILD_CFLAGS += -fno-pic
+
         # prevent gcc from keeping the stack 16 byte aligned
         KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
 
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 5a747dd884db..f7535bedc33f 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -57,7 +57,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
 		   -Wall -Wstrict-prototypes \
 		   -march=i386 -mregparm=3 \
 		   -include $(srctree)/$(src)/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer \
+		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
 		   $(call cc-option, -ffreestanding) \
 		   $(call cc-option, -fno-toplevel-reorder,\
 		   $(call cc-option, -fno-unit-at-a-time)) \
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 441520e4174f..a3ac52b29cbf 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -33,6 +33,14 @@
 #define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
 #define MCI_STATUS_S     (1ULL<<56)  /* Signaled machine check */
 #define MCI_STATUS_AR    (1ULL<<55)  /* Action required */
+#define MCACOD           0xffff      /* MCA Error Code */
+
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB     0x00C0      /* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK  0xfff0
+#define MCACOD_L3WB      0x017A      /* L3 Explicit Writeback */
+#define MCACOD_DATA      0x0134      /* Data Load */
+#define MCACOD_INSTR     0x0150      /* Instruction Fetch */
 
 /* MCi_MISC register defines */
 #define MCI_MISC_ADDR_LSB(m)    ((m) & 0x3f)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index dab39350e51e..cb4e43bce98a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -196,11 +196,16 @@ static inline u32 get_ibs_caps(void) { return 0; }
 extern void perf_events_lapic_init(void);
 
 /*
- * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
- * This flag is otherwise unused and ABI specified to be 0, so nobody should
- * care what we do with it.
+ * Abuse bits {3,5} of the cpu eflags register. These flags are otherwise
+ * unused and ABI specified to be 0, so nobody should care what we do with
+ * them.
+ *
+ * EXACT - the IP points to the exact instruction that triggered the
+ *         event (HW bugs exempt).
+ * VM    - original X86_VM_MASK; see set_linear_ip().
  */
 #define PERF_EFLAGS_EXACT	(1UL << 3)
+#define PERF_EFLAGS_VM		(1UL << 5)
 
 struct pt_regs;
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b315a33867f2..33692eaabab5 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -12,8 +12,7 @@
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
  *
- * These are fair FIFO ticket locks, which are currently limited to 256
- * CPUs.
+ * These are fair FIFO ticket locks, which support up to 2^16 CPUs.
  *
  * (the type definitions are in asm/spinlock_types.h)
  */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 95bf99de9058..1b8e5a03d942 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -25,10 +25,6 @@ unsigned long acpi_realmode_flags;
 static char temp_stack[4096];
 #endif
 
-asmlinkage void acpi_enter_s3(void)
-{
-	acpi_enter_sleep_state(3, wake_sleep_flags);
-}
 /**
  * acpi_suspend_lowlevel - save kernel state
  *
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 5653a5791ec9..67f59f8c6956 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,7 +2,6 @@
  * Variables and functions used by the code in sleep.c
  */
 
-#include <linux/linkage.h>
 #include <asm/realmode.h>
 
 extern unsigned long saved_video_mode;
@@ -11,7 +10,6 @@ extern long saved_magic;
 extern int wakeup_pmode_return;
 
 extern u8 wake_sleep_flags;
-extern asmlinkage void acpi_enter_s3(void);
 
 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 extern void wakeup_long64(void);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 72610839f03b..13ab720573e3 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -74,7 +74,9 @@ restore_registers:
 ENTRY(do_suspend_lowlevel)
 	call	save_processor_state
 	call	save_registers
-	call	acpi_enter_s3
+	pushl	$3
+	call	acpi_enter_sleep_state
+	addl	$4, %esp
 
 #	In case of S3 failure, we'll emerge here.  Jump
 #	to ret_point to recover
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 014d1d28c397..8ea5164cbd04 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -71,7 +71,9 @@ ENTRY(do_suspend_lowlevel)
 	movq	%rsi, saved_rsi
 
 	addq	$8, %rsp
-	call	acpi_enter_s3
+	movl	$3, %edi
+	xorl	%eax, %eax
+	call	acpi_enter_sleep_state
 	/* in case something went wrong, restore the machine status and go on */
 	jmp	resume_point
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 931280ff8299..ced4534baed5 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -165,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
 #endif
 
 #ifdef P6_NOP1
-static const unsigned char __initconst_or_module p6nops[] =
+static const unsigned char p6nops[] =
 {
 	P6_NOP1,
 	P6_NOP2,
@@ -224,7 +224,7 @@ void __init arch_init_ideal_nops(void)
 			ideal_nops = intel_nops;
 #endif
 		}
-
+		break;
 	default:
 #ifdef CONFIG_X86_64
 		ideal_nops = k8_nops;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 406eee784684..c265593ec2cd 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1204,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
-	for_each_cpu(cpu, cfg->domain)
+	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
 		per_cpu(vector_irq, cpu)[vector] = -1;
 
 	cfg->vector = 0;
@@ -1212,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 
 	if (likely(!cfg->move_in_progress))
 		return;
-	for_each_cpu(cpu, cfg->old_domain) {
+	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 								vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1356,6 +1356,16 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
 	if (!IO_APIC_IRQ(irq))
 		return;
 
+	/*
+	 * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC
+	 * can handle this irq and the apic driver is finialized at this point,
+	 * update the cfg->domain.
+	 */
+	if (irq < legacy_pic->nr_legacy_irqs &&
+	    cpumask_equal(cfg->domain, cpumask_of(0)))
+		apic->vector_allocation_domain(0, cfg->domain,
+					       apic->target_cpus());
+
 	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
 		return;
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 46d8786d655e..a5fbc3c5fccc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -144,6 +144,8 @@ static int __init x86_xsave_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
 	return 1;
 }
 __setup("noxsave", x86_xsave_setup);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 413c2ced887c..13017626f9a8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -55,13 +55,6 @@ static struct severity {
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
-#define MCACOD 0xffff
-/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
-#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
-#define MCACOD_SCRUBMSK	0xfff0
-#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
-#define MCACOD_DATA	0x0134	/* Data Load */
-#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
 
 	MCESEV(
 		NO, "Invalid",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5e095f873e3e..292d0258311c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -103,6 +103,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
@@ -650,14 +652,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
  * Do a quick check if any of the events requires a panic.
  * This decides if we keep the events around or clear them.
  */
-static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+			  struct pt_regs *regs)
 {
 	int i, ret = 0;
 
 	for (i = 0; i < banks; i++) {
 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
-		if (m->status & MCI_STATUS_VAL)
+		if (m->status & MCI_STATUS_VAL) {
 			__set_bit(i, validp);
+			if (quirk_no_way_out)
+				quirk_no_way_out(i, m, regs);
+		}
 		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
 			ret = 1;
 	}
@@ -1040,7 +1046,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		*final = m;
 
 	memset(valid_banks, 0, sizeof(valid_banks));
-	no_way_out = mce_no_way_out(&m, &msg, valid_banks);
+	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
 
 	barrier();
 
@@ -1418,6 +1424,34 @@ static void __mcheck_cpu_init_generic(void)
 	}
 }
 
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+	if (bank != 0)
+		return;
+	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+		return;
+	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+			  MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+			  MCACOD)) !=
+			 (MCI_STATUS_UC|MCI_STATUS_EN|
+			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+			  MCI_STATUS_AR|MCACOD_INSTR))
+		return;
+
+	m->mcgstatus |= MCG_STATUS_EIPV;
+	m->ip = regs->ip;
+	m->cs = regs->cs;
+}
+
 /* Add per CPU specific workarounds here */
 static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
@@ -1515,6 +1549,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 */
 		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
 			mce_bootlog = 0;
+
+		if (c->x86 == 6 && c->x86_model == 45)
+			quirk_no_way_out = quirk_sandybridge_ifu;
 	}
 	if (monarch_timeout < 0)
 		monarch_timeout = 0;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 29557aa06dda..915b876edd1e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -32,6 +32,8 @@
 #include <asm/smp.h>
 #include <asm/alternative.h>
 #include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
 
 #include "perf_event.h"
 
@@ -1738,6 +1740,29 @@ valid_user_frame(const void __user *fp, unsigned long size)
 	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
 }
 
+static unsigned long get_segment_base(unsigned int segment)
+{
+	struct desc_struct *desc;
+	int idx = segment >> 3;
+
+	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+		if (idx > LDT_ENTRIES)
+			return 0;
+
+		if (idx > current->active_mm->context.size)
+			return 0;
+
+		desc = current->active_mm->context.ldt;
+	} else {
+		if (idx > GDT_ENTRIES)
+			return 0;
+
+		desc = __this_cpu_ptr(&gdt_page.gdt[0]);
+	}
+
+	return get_desc_base(desc + idx);
+}
+
 #ifdef CONFIG_COMPAT
 
 #include <asm/compat.h>
@@ -1746,13 +1771,17 @@ static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	/* 32-bit process in 64-bit kernel. */
+	unsigned long ss_base, cs_base;
 	struct stack_frame_ia32 frame;
 	const void __user *fp;
 
 	if (!test_thread_flag(TIF_IA32))
 		return 0;
 
-	fp = compat_ptr(regs->bp);
+	cs_base = get_segment_base(regs->cs);
+	ss_base = get_segment_base(regs->ss);
+
+	fp = compat_ptr(ss_base + regs->bp);
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
@@ -1765,8 +1794,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		if (!valid_user_frame(fp, sizeof(frame)))
 			break;
 
-		perf_callchain_store(entry, frame.return_address);
-		fp = compat_ptr(frame.next_frame);
+		perf_callchain_store(entry, cs_base + frame.return_address);
+		fp = compat_ptr(ss_base + frame.next_frame);
 	}
 	return 1;
 }
@@ -1789,6 +1818,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 	}
 
+	/*
+	 * We don't know what to do with VM86 stacks.. ignore them for now.
+	 */
+	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+		return;
+
 	fp = (void __user *)regs->bp;
 
 	perf_callchain_store(entry, regs->ip);
@@ -1816,16 +1851,50 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	}
 }
 
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ * VM86 - the good olde 16 bit days, where the linear address is
+ *        20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ * IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ *        to figure out what the 32bit base address is.
+ *
+ * X32  - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
+static unsigned long code_segment_base(struct pt_regs *regs)
 {
-	unsigned long ip;
+	/*
+	 * If we are in VM86 mode, add the segment offset to convert to a
+	 * linear address.
+	 */
+	if (regs->flags & X86_VM_MASK)
+		return 0x10 * regs->cs;
+
+	/*
+	 * For IA32 we look at the GDT/LDT segment base to convert the
+	 * effective IP to a linear address.
+	 */
+#ifdef CONFIG_X86_32
+	if (user_mode(regs) && regs->cs != __USER_CS)
+		return get_segment_base(regs->cs);
+#else
+	if (test_thread_flag(TIF_IA32)) {
+		if (user_mode(regs) && regs->cs != __USER32_CS)
+			return get_segment_base(regs->cs);
+	}
+#endif
+	return 0;
+}
 
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		ip = perf_guest_cbs->get_guest_ip();
-	else
-		ip = instruction_pointer(regs);
+		return perf_guest_cbs->get_guest_ip();
 
-	return ip;
+	return regs->ip + code_segment_base(regs);
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
@@ -1838,7 +1907,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
 		else
 			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
 	} else {
-		if (!kernel_ip(regs->ip))
+		if (user_mode(regs))
 			misc |= PERF_RECORD_MISC_USER;
 		else
 			misc |= PERF_RECORD_MISC_KERNEL;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 821d53b696d1..6605a81ba339 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -516,6 +516,26 @@ static inline bool kernel_ip(unsigned long ip)
 #endif
 }
 
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address, nothing
+ * much we can do about that but pray and treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+	regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+	if (regs->flags & X86_VM_MASK)
+		regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+	regs->ip = ip;
+}
+
 #ifdef CONFIG_CPU_SUP_AMD
 
 int amd_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index da9bcdcd9856..7bfb5bec8630 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -13,6 +13,8 @@
 
 #include <asm/apic.h>
 
+#include "perf_event.h"
+
 static u32 ibs_caps;
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
@@ -536,7 +538,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 	} else {
-		instruction_pointer_set(&regs, ibs_data.regs[1]);
+		set_linear_ip(&regs, ibs_data.regs[1]);
 		regs.flags |= PERF_EFLAGS_EXACT;
 	}
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 382366977d4c..7f2739e03e79 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1522,8 +1522,16 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
 	arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
 	arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
 	arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+	/*
+	 * If PMU counter has PEBS enabled it is not enough to disable counter
+	 * on a guest entry since PEBS memory write can overshoot guest entry
+	 * and corrupt guest memory. Disabling PEBS solves the problem.
+	 */
+	arr[1].msr = MSR_IA32_PEBS_ENABLE;
+	arr[1].host = cpuc->pebs_enabled;
+	arr[1].guest = 0;
 
-	*nr = 1;
+	*nr = 2;
 	return arr;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 629ae0b7ad90..e38d97bf4259 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -499,7 +499,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	 * We sampled a branch insn, rewind using the LBR stack
 	 */
 	if (ip == to) {
-		regs->ip = from;
+		set_linear_ip(regs, from);
 		return 1;
 	}
 
@@ -529,7 +529,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	} while (to < ip);
 
 	if (to == ip) {
-		regs->ip = old_to;
+		set_linear_ip(regs, old_to);
 		return 1;
 	}
 
@@ -569,7 +569,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
 	 */
 	regs = *iregs;
-	regs.ip = pebs->ip;
+	regs.flags = pebs->flags;
+	set_linear_ip(&regs, pebs->ip);
 	regs.bp = pebs->bp;
 	regs.sp = pebs->sp;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 7563fda9f033..0a5571080e74 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -796,7 +796,6 @@ static struct intel_uncore_type *nhm_msr_uncores[] = {
 
 DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
 DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63");
 DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
 DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
 
@@ -902,16 +901,21 @@ static struct attribute_group nhmex_uncore_cbox_format_group = {
 	.attrs = nhmex_uncore_cbox_formats_attr,
 };
 
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+	0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
 static struct intel_uncore_type nhmex_uncore_cbox = {
 	.name		= "cbox",
 	.num_counters	= 6,
-	.num_boxes	= 8,
+	.num_boxes	= 10,
 	.perf_ctr_bits	= 48,
 	.event_ctl	= NHMEX_C0_MSR_PMON_EV_SEL0,
 	.perf_ctr	= NHMEX_C0_MSR_PMON_CTR0,
 	.event_mask	= NHMEX_PMON_RAW_EVENT_MASK,
 	.box_ctl	= NHMEX_C0_MSR_PMON_GLOBAL_CTL,
-	.msr_offset	= NHMEX_C_MSR_OFFSET,
+	.msr_offsets	= nhmex_cbox_msr_offsets,
 	.pair_ctr_ctl	= 1,
 	.ops		= &nhmex_uncore_ops,
 	.format_group	= &nhmex_uncore_cbox_format_group
@@ -1032,24 +1036,22 @@ static struct intel_uncore_type nhmex_uncore_bbox = {
 
 static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
 {
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
 
-	if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) {
-		reg1->config = event->attr.config1;
-		reg2->config = event->attr.config2;
-	} else {
-		reg1->config = ~0ULL;
-		reg2->config = ~0ULL;
-	}
+	/* only TO_R_PROG_EV event uses the match/mask register */
+	if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+	    NHMEX_S_EVENT_TO_R_PROG_EV)
+		return 0;
 
 	if (box->pmu->pmu_idx == 0)
 		reg1->reg = NHMEX_S0_MSR_MM_CFG;
 	else
 		reg1->reg = NHMEX_S1_MSR_MM_CFG;
-
 	reg1->idx = 0;
-
+	reg1->config = event->attr.config1;
+	reg2->config = event->attr.config2;
 	return 0;
 }
 
@@ -1059,8 +1061,8 @@ static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct per
 	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
 	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
 
-	wrmsrl(reg1->reg, 0);
-	if (reg1->config != ~0ULL || reg2->config != ~0ULL) {
+	if (reg1->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg1->reg, 0);
 		wrmsrl(reg1->reg + 1, reg1->config);
 		wrmsrl(reg1->reg + 2, reg2->config);
 		wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
@@ -1074,7 +1076,6 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
 	&format_attr_edge.attr,
 	&format_attr_inv.attr,
 	&format_attr_thresh8.attr,
-	&format_attr_mm_cfg.attr,
 	&format_attr_match.attr,
 	&format_attr_mask.attr,
 	NULL,
@@ -1142,6 +1143,9 @@ static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
 	EVENT_EXTRA_END
 };
 
+/* Nehalem-EX or Westmere-EX ? */
+bool uncore_nhmex;
+
 static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
 {
 	struct intel_uncore_extra_reg *er;
@@ -1171,18 +1175,29 @@ static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64
 		return false;
 
 	/* mask of the shared fields */
-	mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	if (uncore_nhmex)
+		mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	else
+		mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
 	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
 
 	raw_spin_lock_irqsave(&er->lock, flags);
 	/* add mask of the non-shared field if it's in use */
-	if (__BITS_VALUE(atomic_read(&er->ref), idx, 8))
-		mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+		if (uncore_nhmex)
+			mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	}
 
 	if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
 		atomic_add(1 << (idx * 8), &er->ref);
-		mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
-			NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		if (uncore_nhmex)
+			mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+			       NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+			       WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
 		er->config &= ~mask;
 		er->config |= (config & mask);
 		ret = true;
@@ -1216,7 +1231,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
 
 	/* get the non-shared control bits and shift them */
 	idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-	config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	if (uncore_nhmex)
+		config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	else
+		config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
 	if (new_idx > orig_idx) {
 		idx = new_idx - orig_idx;
 		config <<= 3 * idx;
@@ -1226,6 +1244,10 @@ u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
 	}
 
 	/* add the shared control bits back */
+	if (uncore_nhmex)
+		config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	else
+		config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
 	config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
 	if (modify) {
 		/* adjust the main event selector */
@@ -1264,7 +1286,8 @@ again:
 	}
 
 	/* for the match/mask registers */
-	if ((uncore_box_is_fake(box) || !reg2->alloc) &&
+	if (reg2->idx != EXTRA_REG_NONE &&
+	    (uncore_box_is_fake(box) || !reg2->alloc) &&
 	    !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
 		goto fail;
 
@@ -1278,7 +1301,8 @@ again:
 		if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
 			nhmex_mbox_alter_er(event, idx[0], true);
 		reg1->alloc |= alloc;
-		reg2->alloc = 1;
+		if (reg2->idx != EXTRA_REG_NONE)
+			reg2->alloc = 1;
 	}
 	return NULL;
 fail:
@@ -1342,9 +1366,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event
 	struct extra_reg *er;
 	unsigned msr;
 	int reg_idx = 0;
-
-	if (WARN_ON_ONCE(reg1->idx != -1))
-		return -EINVAL;
 	/*
 	 * The mbox events may require 2 extra MSRs at the most. But only
 	 * the lower 32 bits in these MSRs are significant, so we can use
@@ -1355,11 +1376,6 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event
 			continue;
 		if (event->attr.config1 & ~er->valid_mask)
 			return -EINVAL;
-		if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) ||
-		    er->idx == __BITS_VALUE(reg1->idx, 1, 8))
-			continue;
-		if (WARN_ON_ONCE(reg_idx >= 2))
-			return -EINVAL;
 
 		msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
 		if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
@@ -1368,6 +1384,8 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event
 		/* always use the 32~63 bits to pass the PLD config */
 		if (er->idx == EXTRA_REG_NHMEX_M_PLD)
 			reg_idx = 1;
+		else if (WARN_ON_ONCE(reg_idx > 0))
+			return -EINVAL;
 
 		reg1->idx &= ~(0xff << (reg_idx * 8));
 		reg1->reg &= ~(0xffff << (reg_idx * 16));
@@ -1376,17 +1394,21 @@ static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event
 		reg1->config = event->attr.config1;
 		reg_idx++;
 	}
-	/* use config2 to pass the filter config */
-	reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
-	if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
-		reg2->config = event->attr.config2;
-	else
-		reg2->config = ~0ULL;
-	if (box->pmu->pmu_idx == 0)
-		reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
-	else
-		reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
-
+	/*
+	 * The mbox only provides ability to perform address matching
+	 * for the PLD events.
+	 */
+	if (reg_idx == 2) {
+		reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+		if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+			reg2->config = event->attr.config2;
+		else
+			reg2->config = ~0ULL;
+		if (box->pmu->pmu_idx == 0)
+			reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+		else
+			reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+	}
 	return 0;
 }
 
@@ -1422,34 +1444,36 @@ static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct per
 		wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
 			nhmex_mbox_shared_reg_config(box, idx));
 
-	wrmsrl(reg2->reg, 0);
-	if (reg2->config != ~0ULL) {
-		wrmsrl(reg2->reg + 1,
-			reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
-		wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
-			(reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
-		wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+	if (reg2->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg2->reg, 0);
+		if (reg2->config != ~0ULL) {
+			wrmsrl(reg2->reg + 1,
+				reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+			wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+				(reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+			wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+		}
 	}
 
 	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
 }
 
 DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
 DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
 DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
 DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
 DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
 DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
 DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
 DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
 DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
 DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
 DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
 DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
 DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
 DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
 DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
 
 static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
 	&format_attr_count_mode.attr,
@@ -1458,7 +1482,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
 	&format_attr_flag_mode.attr,
 	&format_attr_inc_sel.attr,
 	&format_attr_set_flag_sel.attr,
-	&format_attr_filter_cfg.attr,
+	&format_attr_filter_cfg_en.attr,
 	&format_attr_filter_match.attr,
 	&format_attr_filter_mask.attr,
 	&format_attr_dsp.attr,
@@ -1482,6 +1506,12 @@ static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
 	{ /* end: all zeroes */ },
 };
 
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+	{ /* end: all zeroes */ },
+};
+
 static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
 	NHMEX_UNCORE_OPS_COMMON_INIT(),
 	.enable_event	= nhmex_mbox_msr_enable_event,
@@ -1513,7 +1543,7 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
 	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
 	int port;
 
-	/* adjust the main event selector */
+	/* adjust the main event selector and extra register index */
 	if (reg1->idx % 2) {
 		reg1->idx--;
 		hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
@@ -1522,29 +1552,17 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
 		hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
 	}
 
-	/* adjust address or config of extra register */
+	/* adjust extra register config */
 	port = reg1->idx / 6 + box->pmu->pmu_idx * 4;
 	switch (reg1->idx % 6) {
-	case 0:
-		reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port);
-		break;
-	case 1:
-		reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port);
-		break;
 	case 2:
-		/* the 8~15 bits to the 0~7 bits */
+		/* shift the 8~15 bits to the 0~7 bits */
 		reg1->config >>= 8;
 		break;
 	case 3:
-		/* the 0~7 bits to the 8~15 bits */
+		/* shift the 0~7 bits to the 8~15 bits */
 		reg1->config <<= 8;
 		break;
-	case 4:
-		reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port);
-		break;
-	case 5:
-		reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port);
-		break;
 	};
 }
 
@@ -1671,7 +1689,7 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event
 	struct hw_perf_event *hwc = &event->hw;
 	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
 	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-	int port, idx;
+	int idx;
 
 	idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
 		NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
@@ -1681,27 +1699,11 @@ static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event
 	reg1->idx = idx;
 	reg1->config = event->attr.config1;
 
-	port = idx / 6 + box->pmu->pmu_idx * 4;
-	idx %= 6;
-	switch (idx) {
-	case 0:
-		reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port);
-		break;
-	case 1:
-		reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port);
-		break;
-	case 2:
-	case 3:
-		reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port);
-		break;
+	switch (idx % 6) {
 	case 4:
 	case 5:
-		if (idx == 4)
-			reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port);
-		else
-			reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port);
-		reg2->config = event->attr.config2;
 		hwc->config |= event->attr.config & (~0ULL << 32);
+		reg2->config = event->attr.config2;
 		break;
 	};
 	return 0;
@@ -1727,28 +1729,34 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per
 	struct hw_perf_event *hwc = &event->hw;
 	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
 	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-	int idx, er_idx;
+	int idx, port;
 
-	idx = reg1->idx % 6;
-	er_idx = idx;
-	if (er_idx > 2)
-		er_idx--;
-	er_idx += (reg1->idx / 6) * 5;
+	idx = reg1->idx;
+	port = idx / 6 + box->pmu->pmu_idx * 4;
 
-	switch (idx) {
+	switch (idx % 6) {
 	case 0:
+		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+		break;
 	case 1:
-		wrmsrl(reg1->reg, reg1->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
 		break;
 	case 2:
 	case 3:
-		wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx));
+		wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+			nhmex_rbox_shared_reg_config(box, 2 + (idx / 6) * 5));
 		break;
 	case 4:
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+			hwc->config >> 32);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+		break;
 	case 5:
-		wrmsrl(reg1->reg, reg1->config);
-		wrmsrl(reg1->reg + 1, hwc->config >> 32);
-		wrmsrl(reg1->reg + 2, reg2->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+			hwc->config >> 32);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
 		break;
 	};
 
@@ -1756,8 +1764,8 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per
 		(hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
 }
 
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
 DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
 DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
 DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
@@ -2303,6 +2311,7 @@ int uncore_pmu_event_init(struct perf_event *event)
 	event->hw.idx = -1;
 	event->hw.last_tag = ~0ULL;
 	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+	event->hw.branch_reg.idx = EXTRA_REG_NONE;
 
 	if (event->attr.config == UNCORE_FIXED_EVENT) {
 		/* no fixed counter */
@@ -2373,7 +2382,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type)
 	type->attr_groups[1] = NULL;
 }
 
-static void uncore_types_exit(struct intel_uncore_type **types)
+static void __init uncore_types_exit(struct intel_uncore_type **types)
 {
 	int i;
 	for (i = 0; types[i]; i++)
@@ -2814,7 +2823,13 @@ static int __init uncore_cpu_init(void)
 		snbep_uncore_cbox.num_boxes = max_cores;
 		msr_uncores = snbep_msr_uncores;
 		break;
-	case 46:
+	case 46: /* Nehalem-EX */
+		uncore_nhmex = true;
+	case 47: /* Westmere-EX aka. Xeon E7 */
+		if (!uncore_nhmex)
+			nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+		if (nhmex_uncore_cbox.num_boxes > max_cores)
+			nhmex_uncore_cbox.num_boxes = max_cores;
 		msr_uncores = nhmex_msr_uncores;
 		break;
 	default:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index f3851892e077..5b81c1856aac 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -5,7 +5,7 @@
5#include "perf_event.h" 5#include "perf_event.h"
6 6
7#define UNCORE_PMU_NAME_LEN 32 7#define UNCORE_PMU_NAME_LEN 32
8#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) 8#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
9 9
10#define UNCORE_FIXED_EVENT 0xff 10#define UNCORE_FIXED_EVENT 0xff
11#define UNCORE_PMC_IDX_MAX_GENERIC 8 11#define UNCORE_PMC_IDX_MAX_GENERIC 8
@@ -230,6 +230,7 @@
230#define NHMEX_S1_MSR_MASK 0xe5a 230#define NHMEX_S1_MSR_MASK 0xe5a
231 231
232#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) 232#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
233#define NHMEX_S_EVENT_TO_R_PROG_EV 0
233 234
234/* NHM-EX Mbox */ 235/* NHM-EX Mbox */
235#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 236#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
@@ -275,18 +276,12 @@
275 NHMEX_M_PMON_CTL_INC_SEL_MASK | \ 276 NHMEX_M_PMON_CTL_INC_SEL_MASK | \
276 NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) 277 NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
277 278
278 279#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
279#define NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK 0x1f
280#define NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK (0x7 << 5)
281#define NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK (0x7 << 8)
282#define NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR (1 << 23)
283#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK \
284 (NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK | \
285 NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK | \
286 NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK | \
287 NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR)
288#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) 280#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n)))
289 281
282#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
283#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (12 + 3 * (n)))
284
290/* 285/*
291 * use the 9~13 bits to select event If the 7th bit is not set, 286 * use the 9~13 bits to select event If the 7th bit is not set,
292 * otherwise use the 19~21 bits to select event. 287 * otherwise use the 19~21 bits to select event.
@@ -368,6 +363,7 @@ struct intel_uncore_type {
368 unsigned num_shared_regs:8; 363 unsigned num_shared_regs:8;
369 unsigned single_fixed:1; 364 unsigned single_fixed:1;
370 unsigned pair_ctr_ctl:1; 365 unsigned pair_ctr_ctl:1;
366 unsigned *msr_offsets;
371 struct event_constraint unconstrainted; 367 struct event_constraint unconstrainted;
372 struct event_constraint *constraints; 368 struct event_constraint *constraints;
373 struct intel_uncore_pmu *pmus; 369 struct intel_uncore_pmu *pmus;
@@ -485,29 +481,31 @@ unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
485 return idx * 8 + box->pmu->type->perf_ctr; 481 return idx * 8 + box->pmu->type->perf_ctr;
486} 482}
487 483
488static inline 484static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
489unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) 485{
486 struct intel_uncore_pmu *pmu = box->pmu;
487 return pmu->type->msr_offsets ?
488 pmu->type->msr_offsets[pmu->pmu_idx] :
489 pmu->type->msr_offset * pmu->pmu_idx;
490}
491
492static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
490{ 493{
491 if (!box->pmu->type->box_ctl) 494 if (!box->pmu->type->box_ctl)
492 return 0; 495 return 0;
493 return box->pmu->type->box_ctl + 496 return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
494 box->pmu->type->msr_offset * box->pmu->pmu_idx;
495} 497}
496 498
497static inline 499static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
498unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
499{ 500{
500 if (!box->pmu->type->fixed_ctl) 501 if (!box->pmu->type->fixed_ctl)
501 return 0; 502 return 0;
502 return box->pmu->type->fixed_ctl + 503 return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
503 box->pmu->type->msr_offset * box->pmu->pmu_idx;
504} 504}
505 505
506static inline 506static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
507unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
508{ 507{
509 return box->pmu->type->fixed_ctr + 508 return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
510 box->pmu->type->msr_offset * box->pmu->pmu_idx;
511} 509}
512 510
513static inline 511static inline
@@ -515,7 +513,7 @@ unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
515{ 513{
516 return box->pmu->type->event_ctl + 514 return box->pmu->type->event_ctl +
517 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + 515 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
518 box->pmu->type->msr_offset * box->pmu->pmu_idx; 516 uncore_msr_box_offset(box);
519} 517}
520 518
521static inline 519static inline
@@ -523,7 +521,7 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
523{ 521{
524 return box->pmu->type->perf_ctr + 522 return box->pmu->type->perf_ctr +
525 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + 523 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
526 box->pmu->type->msr_offset * box->pmu->pmu_idx; 524 uncore_msr_box_offset(box);
527} 525}
528 526
529static inline 527static inline
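Note on the hunk above: the new uncore_msr_box_offset() helper centralizes how a box's MSR base is computed. When a box type supplies an explicit msr_offsets[] table it wins over the old "fixed msr_offset stride times pmu_idx" rule. A minimal userspace sketch of that lookup follows; the struct layout mirrors the idea only, and the MSR bases, strides and table values are invented for illustration.

/* Standalone sketch of the offset selection: a per-type table, when
 * present, overrides the fixed stride.  All numbers are made up.
 */
#include <stdio.h>

struct uncore_type {
	unsigned base_ctl;		/* box_ctl MSR of box 0 */
	unsigned msr_offset;		/* fixed stride, used when no table */
	const unsigned *msr_offsets;	/* optional per-box offset table */
};

static unsigned box_offset(const struct uncore_type *t, int pmu_idx)
{
	return t->msr_offsets ? t->msr_offsets[pmu_idx]
			      : t->msr_offset * pmu_idx;
}

int main(void)
{
	static const unsigned irregular[] = { 0x0, 0x40, 0xa0, 0x120 };
	struct uncore_type even = { .base_ctl = 0xd04, .msr_offset = 0x20 };
	struct uncore_type odd  = { .base_ctl = 0xc10, .msr_offsets = irregular };

	for (int i = 0; i < 4; i++)
		printf("even box %d ctl=0x%x  odd box %d ctl=0x%x\n",
		       i, even.base_ctl + box_offset(&even, i),
		       i, odd.base_ctl + box_offset(&odd, i));
	return 0;
}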
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 1f5f1d5d2a02..d44f7829968e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -270,7 +270,7 @@ void fixup_irqs(void)
270 270
271 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 271 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
272 break_affinity = 1; 272 break_affinity = 1;
273 affinity = cpu_all_mask; 273 affinity = cpu_online_mask;
274 } 274 }
275 275
276 chip = irq_data_get_irq_chip(data); 276 chip = irq_data_get_irq_chip(data);
@@ -328,6 +328,7 @@ void fixup_irqs(void)
328 chip->irq_retrigger(data); 328 chip->irq_retrigger(data);
329 raw_spin_unlock(&desc->lock); 329 raw_spin_unlock(&desc->lock);
330 } 330 }
331 __this_cpu_write(vector_irq[vector], -1);
331 } 332 }
332} 333}
333#endif 334#endif
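Note on the hunk above: fixup_irqs() now falls back to cpu_online_mask when an IRQ's affinity no longer intersects the online CPUs, and it resets the dying CPU's vector_irq[] slot after retriggering, so a stale vector-to-IRQ mapping cannot be resolved later. A toy stand-in for that cleanup follows; the table and vector numbers below are invented.

/* Toy version of the per-CPU vector table cleanup: slots that still map
 * a vector to an IRQ are dropped once the IRQ has been handed off.
 */
#include <stdio.h>

#define NR_VECTORS 8

int main(void)
{
	int vector_irq[NR_VECTORS] = { -1, -1, 17, -1, 23, -1, -1, -1 };

	for (int vector = 0; vector < NR_VECTORS; vector++) {
		int irq = vector_irq[vector];

		if (irq < 0)
			continue;
		/* ... mask, move or retrigger on an online CPU ... */
		vector_irq[vector] = -1;	/* drop the stale mapping */
	}

	for (int v = 0; v < NR_VECTORS; v++)
		printf("%d ", vector_irq[v]);
	printf("\n");
	return 0;
}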
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 1d5d31ea686b..dc1404bf8e4b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -107,7 +107,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
107{ 107{
108 struct setup_data_node *node; 108 struct setup_data_node *node;
109 struct setup_data *data; 109 struct setup_data *data;
110 int error = -ENOMEM; 110 int error;
111 struct dentry *d; 111 struct dentry *d;
112 struct page *pg; 112 struct page *pg;
113 u64 pa_data; 113 u64 pa_data;
@@ -121,8 +121,10 @@ static int __init create_setup_data_nodes(struct dentry *parent)
121 121
122 while (pa_data) { 122 while (pa_data) {
123 node = kmalloc(sizeof(*node), GFP_KERNEL); 123 node = kmalloc(sizeof(*node), GFP_KERNEL);
124 if (!node) 124 if (!node) {
125 error = -ENOMEM;
125 goto err_dir; 126 goto err_dir;
127 }
126 128
127 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 129 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
128 if (PageHighMem(pg)) { 130 if (PageHighMem(pg)) {
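Note on the hunk above: the error code is now assigned at the failure site instead of pre-seeding the variable with -ENOMEM, which keeps later failure paths free to set their own codes without anything being overwritten by accident. A minimal userspace example of the pattern follows; the function name and the simulated failure are invented.

/* Error-path sketch: -ENOMEM is set exactly where the allocation fails. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int create_nodes(int simulate_oom)
{
	int error;
	char *node;

	node = simulate_oom ? NULL : malloc(32);
	if (!node) {
		error = -ENOMEM;	/* set at the failure site */
		goto err;
	}

	free(node);
	return 0;
err:
	return error;
}

int main(void)
{
	printf("ok path: %d, oom path: %d\n", create_nodes(0), create_nodes(1));
	return 0;
}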
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 8a2ce8fd41c0..82746f942cd8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
143 unsigned int *current_size) 143 unsigned int *current_size)
144{ 144{
145 struct microcode_header_amd *mc_hdr; 145 struct microcode_header_amd *mc_hdr;
146 unsigned int actual_size; 146 unsigned int actual_size, patch_size;
147 u16 equiv_cpu_id; 147 u16 equiv_cpu_id;
148 148
149 /* size of the current patch we're staring at */ 149 /* size of the current patch we're staring at */
150 *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; 150 patch_size = *(u32 *)(ucode_ptr + 4);
151 *current_size = patch_size + SECTION_HDR_SIZE;
151 152
152 equiv_cpu_id = find_equiv_id(); 153 equiv_cpu_id = find_equiv_id();
153 if (!equiv_cpu_id) 154 if (!equiv_cpu_id)
@@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
174 /* 175 /*
175 * now that the header looks sane, verify its size 176 * now that the header looks sane, verify its size
176 */ 177 */
177 actual_size = verify_ucode_size(cpu, *current_size, leftover_size); 178 actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
178 if (!actual_size) 179 if (!actual_size)
179 return 0; 180 return 0;
180 181
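Note on the hunk above: the fix separates the raw patch size (the u32 stored at offset 4 of a section) from current_size, which also counts SECTION_HDR_SIZE, so verify_ucode_size() is handed the patch body length rather than the container stride. A tiny sketch of that bookkeeping follows, using an invented buffer layout and sizes.

/* Size bookkeeping sketch: the stored length covers the patch body only;
 * the section header is added when stepping through the container.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SECTION_HDR_SIZE 8

int main(void)
{
	uint8_t container[64] = { 0 };
	uint32_t patch_size = 40;		/* body only, hypothetical */

	memcpy(container + 4, &patch_size, sizeof(patch_size));

	uint32_t body;				/* what the verifier should see */
	memcpy(&body, container + 4, sizeof(body));
	uint32_t current_size = body + SECTION_HDR_SIZE; /* step to next section */

	printf("patch body %u bytes, advance by %u bytes\n", body, current_size);
	return 0;
}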
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97d9a9914ba8..a3b57a27be88 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
475 return address_mask(ctxt, reg); 475 return address_mask(ctxt, reg);
476} 476}
477 477
478static void masked_increment(ulong *reg, ulong mask, int inc)
479{
480 assign_masked(reg, *reg + inc, mask);
481}
482
478static inline void 483static inline void
479register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) 484register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
480{ 485{
486 ulong mask;
487
481 if (ctxt->ad_bytes == sizeof(unsigned long)) 488 if (ctxt->ad_bytes == sizeof(unsigned long))
482 *reg += inc; 489 mask = ~0UL;
483 else 490 else
484 *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); 491 mask = ad_mask(ctxt);
492 masked_increment(reg, mask, inc);
493}
494
495static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
496{
497 masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
485} 498}
486 499
487static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) 500static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1522{ 1535{
1523 struct segmented_address addr; 1536 struct segmented_address addr;
1524 1537
1525 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); 1538 rsp_increment(ctxt, -bytes);
1526 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1539 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
1527 addr.seg = VCPU_SREG_SS; 1540 addr.seg = VCPU_SREG_SS;
1528 1541
1529 return segmented_write(ctxt, addr, data, bytes); 1542 return segmented_write(ctxt, addr, data, bytes);
@@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1542 int rc; 1555 int rc;
1543 struct segmented_address addr; 1556 struct segmented_address addr;
1544 1557
1545 addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); 1558 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
1546 addr.seg = VCPU_SREG_SS; 1559 addr.seg = VCPU_SREG_SS;
1547 rc = segmented_read(ctxt, addr, dest, len); 1560 rc = segmented_read(ctxt, addr, dest, len);
1548 if (rc != X86EMUL_CONTINUE) 1561 if (rc != X86EMUL_CONTINUE)
1549 return rc; 1562 return rc;
1550 1563
1551 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); 1564 rsp_increment(ctxt, len);
1552 return rc; 1565 return rc;
1553} 1566}
1554 1567
@@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1688 1701
1689 while (reg >= VCPU_REGS_RAX) { 1702 while (reg >= VCPU_REGS_RAX) {
1690 if (reg == VCPU_REGS_RSP) { 1703 if (reg == VCPU_REGS_RSP) {
1691 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], 1704 rsp_increment(ctxt, ctxt->op_bytes);
1692 ctxt->op_bytes);
1693 --reg; 1705 --reg;
1694 } 1706 }
1695 1707
@@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2825 rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); 2837 rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
2826 if (rc != X86EMUL_CONTINUE) 2838 if (rc != X86EMUL_CONTINUE)
2827 return rc; 2839 return rc;
2828 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); 2840 rsp_increment(ctxt, ctxt->src.val);
2829 return X86EMUL_CONTINUE; 2841 return X86EMUL_CONTINUE;
2830} 2842}
2831 2843
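Note on the hunks above: masked_increment() confines a register increment to the bits selected by the address-size or stack-size mask, so for example a 16-bit SP wraps at 64K while the upper register bits stay put; rsp_increment() is just that applied with stack_mask(). A worked example of the arithmetic follows; the sample register values are arbitrary.

/* Worked example mirroring assign_masked()/masked_increment(): only the
 * masked low bits wrap, the high bits are preserved.
 */
#include <stdio.h>

typedef unsigned long ulong;

static void assign_masked(ulong *reg, ulong val, ulong mask)
{
	*reg = (*reg & ~mask) | (val & mask);
}

static void masked_increment(ulong *reg, ulong mask, long inc)
{
	assign_masked(reg, *reg + inc, mask);
}

int main(void)
{
	ulong rsp = 0x12340002;		/* 16-bit stack: only SP wraps */

	masked_increment(&rsp, 0xffff, -8);
	printf("after 16-bit push of 8 bytes: 0x%lx\n", rsp); /* 0x1234fffa */

	masked_increment(&rsp, ~0UL, -8);	/* 64-bit stack: whole register */
	printf("after 64-bit push of 8 bytes: 0x%lx\n", rsp);
	return 0;
}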
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1df8fb9e1d5d..e498b18f010c 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -316,6 +316,11 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
316 addr &= 1; 316 addr &= 1;
317 if (addr == 0) { 317 if (addr == 0) {
318 if (val & 0x10) { 318 if (val & 0x10) {
319 u8 edge_irr = s->irr & ~s->elcr;
320 int i;
321 bool found;
322 struct kvm_vcpu *vcpu;
323
319 s->init4 = val & 1; 324 s->init4 = val & 1;
320 s->last_irr = 0; 325 s->last_irr = 0;
321 s->irr &= s->elcr; 326 s->irr &= s->elcr;
@@ -333,6 +338,18 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
333 if (val & 0x08) 338 if (val & 0x08)
334 pr_pic_unimpl( 339 pr_pic_unimpl(
335 "level sensitive irq not supported"); 340 "level sensitive irq not supported");
341
342 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
343 if (kvm_apic_accept_pic_intr(vcpu)) {
344 found = true;
345 break;
346 }
347
348
349 if (found)
350 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
351 if (edge_irr & (1 << irq))
352 pic_clear_isr(s, irq);
336 } else if (val & 0x08) { 353 } else if (val & 0x08) {
337 if (val & 0x04) 354 if (val & 0x04)
338 s->poll = 1; 355 s->poll = 1;
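Note on the hunk above: on an ICW1 write the emulated PIC now clears the in-service bits of edge-triggered IRQs that were latched in IRR (edge_irr = irr & ~elcr), provided some vcpu still accepts PIC interrupts, so those lines are not left stuck in service across the reset. A standalone sketch of the bit manipulation follows; the register values are made up.

/* Edge-IRR cleanup sketch: edge-triggered, latched IRQs lose their ISR bit. */
#include <stdint.h>
#include <stdio.h>

#define PIC_PINS 8

int main(void)
{
	uint8_t irr  = 0x2c;	/* latched requests */
	uint8_t elcr = 0x08;	/* IRQ3 is level triggered */
	uint8_t isr  = 0x2c;	/* pretend all of them were in service */

	uint8_t edge_irr = irr & ~elcr;

	for (int irq = 0; irq < PIC_PINS; irq++)
		if (edge_irr & (1 << irq))
			isr &= ~(1 << irq);	/* pic_clear_isr() equivalent */

	printf("edge_irr=0x%02x isr after init=0x%02x\n", edge_irr, isr);
	return 0;
}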
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01ca00423938..7fbd0d273ea8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4113,16 +4113,21 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
4113 LIST_HEAD(invalid_list); 4113 LIST_HEAD(invalid_list);
4114 4114
4115 /* 4115 /*
4116 * Never scan more than sc->nr_to_scan VM instances.
4117 * Will not hit this condition practically since we do not try
4118 * to shrink more than one VM and it is very unlikely to see
4119 * !n_used_mmu_pages so many times.
4120 */
4121 if (!nr_to_scan--)
4122 break;
4123 /*
4116 * n_used_mmu_pages is accessed without holding kvm->mmu_lock 4124 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
4117 * here. We may skip a VM instance erroneously, but we do not 4125 * here. We may skip a VM instance erroneously, but we do not
4118 * want to shrink a VM that only started to populate its MMU 4126 * want to shrink a VM that only started to populate its MMU
4119 * anyway. 4127 * anyway.
4120 */ 4128 */
4121 if (kvm->arch.n_used_mmu_pages > 0) { 4129 if (!kvm->arch.n_used_mmu_pages)
4122 if (!nr_to_scan--)
4123 break;
4124 continue; 4130 continue;
4125 }
4126 4131
4127 idx = srcu_read_lock(&kvm->srcu); 4132 idx = srcu_read_lock(&kvm->srcu);
4128 spin_lock(&kvm->mmu_lock); 4133 spin_lock(&kvm->mmu_lock);
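Note on the hunk above: mmu_shrink() now charges one unit of sc->nr_to_scan for every VM it inspects, before the n_used_mmu_pages check, so a long run of VMs with nothing to reclaim can no longer prolong the scan. A toy version of that loop control follows; the per-VM page counts are invented.

/* Loop-control sketch: every VM looked at consumes scan budget. */
#include <stdio.h>

int main(void)
{
	int used_pages[] = { 0, 0, 0, 7, 0, 12 };	/* per-VM MMU page counts */
	int nvms = sizeof(used_pages) / sizeof(used_pages[0]);
	int nr_to_scan = 4;
	int shrunk = -1;

	for (int i = 0; i < nvms; i++) {
		if (!nr_to_scan--)		/* budget spent: stop scanning */
			break;
		if (!used_pages[i])		/* nothing to reclaim here */
			continue;
		shrunk = i;			/* would zap pages of this VM */
		break;
	}
	printf("scanned up to VM %d within the budget\n", shrunk);
	return 0;
}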
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c39b60707e02..c00f03de1b79 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1488,13 +1488,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1488 loadsegment(ds, vmx->host_state.ds_sel); 1488 loadsegment(ds, vmx->host_state.ds_sel);
1489 loadsegment(es, vmx->host_state.es_sel); 1489 loadsegment(es, vmx->host_state.es_sel);
1490 } 1490 }
1491#else
1492 /*
1493 * The sysexit path does not restore ds/es, so we must set them to
1494 * a reasonable value ourselves.
1495 */
1496 loadsegment(ds, __USER_DS);
1497 loadsegment(es, __USER_DS);
1498#endif 1491#endif
1499 reload_tss(); 1492 reload_tss();
1500#ifdef CONFIG_X86_64 1493#ifdef CONFIG_X86_64
@@ -6370,6 +6363,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6370#endif 6363#endif
6371 ); 6364 );
6372 6365
6366#ifndef CONFIG_X86_64
6367 /*
6368 * The sysexit path does not restore ds/es, so we must set them to
6369 * a reasonable value ourselves.
6370 *
6371 * We can't defer this to vmx_load_host_state() since that function
 6372	 * may be executed in interrupt context, which saves and restores segments
6373 * around it, nullifying its effect.
6374 */
6375 loadsegment(ds, __USER_DS);
6376 loadsegment(es, __USER_DS);
6377#endif
6378
6373 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 6379 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
6374 | (1 << VCPU_EXREG_RFLAGS) 6380 | (1 << VCPU_EXREG_RFLAGS)
6375 | (1 << VCPU_EXREG_CPL) 6381 | (1 << VCPU_EXREG_CPL)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 59b59508ff07..148ed666e311 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
806 * kvm-specific. Those are put in the beginning of the list. 806 * kvm-specific. Those are put in the beginning of the list.
807 */ 807 */
808 808
809#define KVM_SAVE_MSRS_BEGIN 9 809#define KVM_SAVE_MSRS_BEGIN 10
810static u32 msrs_to_save[] = { 810static u32 msrs_to_save[] = {
811 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 811 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 812 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
@@ -925,6 +925,10 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
925 */ 925 */
926 getboottime(&boot); 926 getboottime(&boot);
927 927
928 if (kvm->arch.kvmclock_offset) {
929 struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
930 boot = timespec_sub(boot, ts);
931 }
928 wc.sec = boot.tv_sec; 932 wc.sec = boot.tv_sec;
929 wc.nsec = boot.tv_nsec; 933 wc.nsec = boot.tv_nsec;
930 wc.version = version; 934 wc.version = version;
@@ -1996,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1996 case MSR_KVM_STEAL_TIME: 2000 case MSR_KVM_STEAL_TIME:
1997 data = vcpu->arch.st.msr_val; 2001 data = vcpu->arch.st.msr_val;
1998 break; 2002 break;
2003 case MSR_KVM_PV_EOI_EN:
2004 data = vcpu->arch.pv_eoi.msr_val;
2005 break;
1999 case MSR_IA32_P5_MC_ADDR: 2006 case MSR_IA32_P5_MC_ADDR:
2000 case MSR_IA32_P5_MC_TYPE: 2007 case MSR_IA32_P5_MC_TYPE:
2001 case MSR_IA32_MCG_CAP: 2008 case MSR_IA32_MCG_CAP:
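Note on the wall-clock hunk above: the per-VM kvmclock_offset (in nanoseconds) is now folded into the boot time reported to the guest, so an adjusted guest clock stays consistent with its wall-clock base. An arithmetic sketch of that subtraction follows; the timestamps and offset are arbitrary example values.

/* timespec subtraction sketch for the kvmclock_offset adjustment. */
#include <stdio.h>
#include <time.h>

static struct timespec timespec_sub(struct timespec a, struct timespec b)
{
	struct timespec r = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };

	if (r.tv_nsec < 0) {
		r.tv_sec--;
		r.tv_nsec += 1000000000L;
	}
	return r;
}

int main(void)
{
	struct timespec boot = { 1600000000, 250000000 };	/* host boot time */
	long long kvmclock_offset = 3500000000LL;		/* 3.5 s, in ns */

	struct timespec ofs = { kvmclock_offset / 1000000000LL,
				kvmclock_offset % 1000000000LL };
	struct timespec wc = timespec_sub(boot, ofs);

	printf("guest wall-clock base: %lld.%09ld\n",
	       (long long)wc.tv_sec, wc.tv_nsec);
	return 0;
}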
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f6679a7fb8ca..b91e48512425 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
56} 56}
57 57
58/* 58/*
59 * search for a shareable pmd page for hugetlb. 59 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
60 * and returns the corresponding pte. While this is not necessary for the
61 * !shared pmd case because we can allocate the pmd later as well, it makes the
62 * code much cleaner. pmd allocation is essential for the shared case because
63 * pud has to be populated inside the same i_mmap_mutex section - otherwise
64 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
65 * bad pmd for sharing.
60 */ 66 */
61static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 67static pte_t *
68huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
62{ 69{
63 struct vm_area_struct *vma = find_vma(mm, addr); 70 struct vm_area_struct *vma = find_vma(mm, addr);
64 struct address_space *mapping = vma->vm_file->f_mapping; 71 struct address_space *mapping = vma->vm_file->f_mapping;
@@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
68 struct vm_area_struct *svma; 75 struct vm_area_struct *svma;
69 unsigned long saddr; 76 unsigned long saddr;
70 pte_t *spte = NULL; 77 pte_t *spte = NULL;
78 pte_t *pte;
71 79
72 if (!vma_shareable(vma, addr)) 80 if (!vma_shareable(vma, addr))
73 return; 81 return (pte_t *)pmd_alloc(mm, pud, addr);
74 82
75 mutex_lock(&mapping->i_mmap_mutex); 83 mutex_lock(&mapping->i_mmap_mutex);
76 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { 84 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
@@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
97 put_page(virt_to_page(spte)); 105 put_page(virt_to_page(spte));
98 spin_unlock(&mm->page_table_lock); 106 spin_unlock(&mm->page_table_lock);
99out: 107out:
108 pte = (pte_t *)pmd_alloc(mm, pud, addr);
100 mutex_unlock(&mapping->i_mmap_mutex); 109 mutex_unlock(&mapping->i_mmap_mutex);
110 return pte;
101} 111}
102 112
103/* 113/*
@@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
142 } else { 152 } else {
143 BUG_ON(sz != PMD_SIZE); 153 BUG_ON(sz != PMD_SIZE);
144 if (pud_none(*pud)) 154 if (pud_none(*pud))
145 huge_pmd_share(mm, addr, pud); 155 pte = huge_pmd_share(mm, addr, pud);
146 pte = (pte_t *) pmd_alloc(mm, pud, addr); 156 else
157 pte = (pte_t *)pmd_alloc(mm, pud, addr);
147 } 158 }
148 } 159 }
149 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 160 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
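Note on the hunks above: huge_pmd_share() now always returns the pte from pmd_alloc() and, in the shareable case, performs that allocation while i_mmap_mutex is still held, so the pud is populated inside the same critical section that searched for a shareable pmd. A control-flow sketch follows, with a plain mutex and an integer slot standing in for the page tables; every name below is a stand-in.

/* Control-flow sketch: allocate inside the same critical section that
 * decides whether sharing is possible.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
static int pud_slot;			/* 0 = empty, otherwise "pmd page id" */

static int pmd_alloc(void)		/* stand-in for the real allocator */
{
	if (!pud_slot)
		pud_slot = 42;
	return pud_slot;
}

static int huge_pmd_share(int shareable)
{
	int pte;

	if (!shareable)
		return pmd_alloc();

	pthread_mutex_lock(&i_mmap_mutex);
	/* ... search other VMAs for a pmd to share would go here ... */
	pte = pmd_alloc();		/* populate pud before dropping the lock */
	pthread_mutex_unlock(&i_mmap_mutex);
	return pte;
}

int main(void)
{
	printf("shared case -> pmd %d, private case -> pmd %d\n",
	       huge_pmd_share(1), huge_pmd_share(0));
	return 0;
}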
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 931930a96160..a718e0d23503 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -919,13 +919,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
919 919
920 /* 920 /*
921 * On success we use clflush, when the CPU supports it to 921 * On success we use clflush, when the CPU supports it to
922	 * avoid the wbinvd. If the CPU does not support it, in the 922	 * avoid the wbinvd. If the CPU does not support it and in the
923 * error case, and during early boot (for EFI) we fall back 923 * error case we fall back to cpa_flush_all (which uses
924	 * to cpa_flush_all (which uses wbinvd): 924	 * wbinvd):
925 */ 925 */
926 if (early_boot_irqs_disabled) 926 if (!ret && cpu_has_clflush) {
927 __cpa_flush_all((void *)(long)cache);
928 else if (!ret && cpu_has_clflush) {
929 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 927 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
930 cpa_flush_array(addr, numpages, cache, 928 cpa_flush_array(addr, numpages, cache,
931 cpa.flags, pages); 929 cpa.flags, pages);
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 4599c3e8bcb6..4ddf497ca65b 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -142,23 +142,23 @@ static inline int save_add_info(void) {return 0;}
142#endif 142#endif
143 143
144/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 144/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
145void __init 145int __init
146acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 146acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
147{ 147{
148 u64 start, end; 148 u64 start, end;
149 int node, pxm; 149 int node, pxm;
150 150
151 if (srat_disabled()) 151 if (srat_disabled())
152 return; 152 return -1;
153 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { 153 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
154 bad_srat(); 154 bad_srat();
155 return; 155 return -1;
156 } 156 }
157 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) 157 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
158 return; 158 return -1;
159 159
160 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) 160 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
161 return; 161 return -1;
162 start = ma->base_address; 162 start = ma->base_address;
163 end = start + ma->length; 163 end = start + ma->length;
164 pxm = ma->proximity_domain; 164 pxm = ma->proximity_domain;
@@ -168,12 +168,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
168 if (node < 0) { 168 if (node < 0) {
169 printk(KERN_ERR "SRAT: Too many proximity domains.\n"); 169 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
170 bad_srat(); 170 bad_srat();
171 return; 171 return -1;
172 } 172 }
173 173
174 if (numa_add_memblk(node, start, end) < 0) { 174 if (numa_add_memblk(node, start, end) < 0) {
175 bad_srat(); 175 bad_srat();
176 return; 176 return -1;
177 } 177 }
178 178
179 node_set(node, numa_nodes_parsed); 179 node_set(node, numa_nodes_parsed);
@@ -181,6 +181,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
181 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", 181 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
182 node, pxm, 182 node, pxm,
183 (unsigned long long) start, (unsigned long long) end - 1); 183 (unsigned long long) start, (unsigned long long) end - 1);
184 return 0;
184} 185}
185 186
186void __init acpi_numa_arch_fixup(void) {} 187void __init acpi_numa_arch_fixup(void) {}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 2dc29f51e75a..92660edaa1e7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -234,7 +234,22 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
234 return status; 234 return status;
235} 235}
236 236
237static int efi_set_rtc_mmss(unsigned long nowtime) 237static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
238 efi_time_cap_t *tc)
239{
240 unsigned long flags;
241 efi_status_t status;
242
243 spin_lock_irqsave(&rtc_lock, flags);
244 efi_call_phys_prelog();
245 status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
246 virt_to_phys(tc));
247 efi_call_phys_epilog();
248 spin_unlock_irqrestore(&rtc_lock, flags);
249 return status;
250}
251
252int efi_set_rtc_mmss(unsigned long nowtime)
238{ 253{
239 int real_seconds, real_minutes; 254 int real_seconds, real_minutes;
240 efi_status_t status; 255 efi_status_t status;
@@ -263,7 +278,7 @@ static int efi_set_rtc_mmss(unsigned long nowtime)
263 return 0; 278 return 0;
264} 279}
265 280
266static unsigned long efi_get_time(void) 281unsigned long efi_get_time(void)
267{ 282{
268 efi_status_t status; 283 efi_status_t status;
269 efi_time_t eft; 284 efi_time_t eft;
@@ -606,13 +621,18 @@ static int __init efi_runtime_init(void)
606 } 621 }
607 /* 622 /*
608 * We will only need *early* access to the following 623 * We will only need *early* access to the following
609 * EFI runtime service before set_virtual_address_map 624 * two EFI runtime services before set_virtual_address_map
610 * is invoked. 625 * is invoked.
611 */ 626 */
627 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
612 efi_phys.set_virtual_address_map = 628 efi_phys.set_virtual_address_map =
613 (efi_set_virtual_address_map_t *) 629 (efi_set_virtual_address_map_t *)
614 runtime->set_virtual_address_map; 630 runtime->set_virtual_address_map;
615 631 /*
 632	 * Make sure efi_get_time can be called before entering
633 * virtual mode.
634 */
635 efi.get_time = phys_efi_get_time;
616 early_iounmap(runtime, sizeof(efi_runtime_services_t)); 636 early_iounmap(runtime, sizeof(efi_runtime_services_t));
617 637
618 return 0; 638 return 0;
@@ -700,10 +720,12 @@ void __init efi_init(void)
700 efi_enabled = 0; 720 efi_enabled = 0;
701 return; 721 return;
702 } 722 }
723#ifdef CONFIG_X86_32
703 if (efi_native) { 724 if (efi_native) {
704 x86_platform.get_wallclock = efi_get_time; 725 x86_platform.get_wallclock = efi_get_time;
705 x86_platform.set_wallclock = efi_set_rtc_mmss; 726 x86_platform.set_wallclock = efi_set_rtc_mmss;
706 } 727 }
728#endif
707 729
708#if EFI_DEBUG 730#if EFI_DEBUG
709 print_efi_memmap(); 731 print_efi_memmap();
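Note on the hunks above: phys_efi_get_time() lets efi.get_time work before SetVirtualAddressMap has run; it takes rtc_lock and brackets the physical-mode firmware call with efi_call_phys_prelog()/efi_call_phys_epilog(). A userspace sketch of that wrapper pattern follows; the lock, prolog/epilog and firmware call are all stand-ins.

/* Early-boot wrapper sketch: route calls through a locked, bracketed helper
 * until the firmware has been switched to virtual addressing.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtc_lock = PTHREAD_MUTEX_INITIALIZER;

static void phys_prolog(void) { printf("enter physical mapping\n"); }
static void phys_epilog(void) { printf("leave physical mapping\n"); }
static int firmware_get_time(int *t) { *t = 1234; return 0; }

static int phys_get_time(int *t)
{
	int status;

	pthread_mutex_lock(&rtc_lock);
	phys_prolog();
	status = firmware_get_time(t);	/* would be the physical-mode call */
	phys_epilog();
	pthread_mutex_unlock(&rtc_lock);
	return status;
}

/* get_time points at the wrapper until the virtual map is installed */
static int (*get_time)(int *) = phys_get_time;

int main(void)
{
	int t;
	int status = get_time(&t);

	printf("status %d, time %d\n", status, t);
	return 0;
}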
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index b2d534cab25f..88692871823f 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -72,7 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
72 -Wall -Wstrict-prototypes \ 72 -Wall -Wstrict-prototypes \
73 -march=i386 -mregparm=3 \ 73 -march=i386 -mregparm=3 \
74 -include $(srctree)/$(src)/../../boot/code16gcc.h \ 74 -include $(srctree)/$(src)/../../boot/code16gcc.h \
75 -fno-strict-aliasing -fomit-frame-pointer \ 75 -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
76 $(call cc-option, -ffreestanding) \ 76 $(call cc-option, -ffreestanding) \
77 $(call cc-option, -fno-toplevel-reorder,\ 77 $(call cc-option, -fno-toplevel-reorder,\
78 $(call cc-option, -fno-unit-at-a-time)) \ 78 $(call cc-option, -fno-unit-at-a-time)) \
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 51171aeff0dc..a582bfed95bb 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -60,8 +60,8 @@
6051 common getsockname sys_getsockname 6051 common getsockname sys_getsockname
6152 common getpeername sys_getpeername 6152 common getpeername sys_getpeername
6253 common socketpair sys_socketpair 6253 common socketpair sys_socketpair
6354 common setsockopt sys_setsockopt 6354 64 setsockopt sys_setsockopt
6455 common getsockopt sys_getsockopt 6455 64 getsockopt sys_getsockopt
6556 common clone stub_clone 6556 common clone stub_clone
6657 common fork stub_fork 6657 common fork stub_fork
6758 common vfork stub_vfork 6758 common vfork stub_vfork
@@ -318,7 +318,7 @@
318309 common getcpu sys_getcpu 318309 common getcpu sys_getcpu
319310 64 process_vm_readv sys_process_vm_readv 319310 64 process_vm_readv sys_process_vm_readv
320311 64 process_vm_writev sys_process_vm_writev 320311 64 process_vm_writev sys_process_vm_writev
321312 64 kcmp sys_kcmp 321312 common kcmp sys_kcmp
322 322
323# 323#
324# x32-specific system call numbers start at 512 to avoid cache impact 324# x32-specific system call numbers start at 512 to avoid cache impact
@@ -353,3 +353,5 @@
353538 x32 sendmmsg compat_sys_sendmmsg 353538 x32 sendmmsg compat_sys_sendmmsg
354539 x32 process_vm_readv compat_sys_process_vm_readv 354539 x32 process_vm_readv compat_sys_process_vm_readv
355540 x32 process_vm_writev compat_sys_process_vm_writev 355540 x32 process_vm_writev compat_sys_process_vm_writev
356541 x32 setsockopt compat_sys_setsockopt
357542 x32 getsockopt compat_sys_getsockopt
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bf4bda6d3e9a..9642d4a38602 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,7 +31,6 @@
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h> 33#include <linux/memblock.h>
34#include <linux/syscore_ops.h>
35 34
36#include <xen/xen.h> 35#include <xen/xen.h>
37#include <xen/interface/xen.h> 36#include <xen/interface/xen.h>
@@ -1470,130 +1469,38 @@ asmlinkage void __init xen_start_kernel(void)
1470#endif 1469#endif
1471} 1470}
1472 1471
1473#ifdef CONFIG_XEN_PVHVM 1472void __ref xen_hvm_init_shared_info(void)
1474/*
1475 * The pfn containing the shared_info is located somewhere in RAM. This
1476 * will cause trouble if the current kernel is doing a kexec boot into a
1477 * new kernel. The new kernel (and its startup code) can not know where
1478 * the pfn is, so it can not reserve the page. The hypervisor will
1479 * continue to update the pfn, and as a result memory corruption occours
1480 * in the new kernel.
1481 *
1482 * One way to work around this issue is to allocate a page in the
1483 * xen-platform pci device's BAR memory range. But pci init is done very
1484 * late and the shared_info page is already in use very early to read
1485 * the pvclock. So moving the pfn from RAM to MMIO is racy because some
1486 * code paths on other vcpus could access the pfn during the small
1487 * window when the old pfn is moved to the new pfn. There is even a
1488 * small window were the old pfn is not backed by a mfn, and during that
1489 * time all reads return -1.
1490 *
1491 * Because it is not known upfront where the MMIO region is located it
1492 * can not be used right from the start in xen_hvm_init_shared_info.
1493 *
1494 * To minimise trouble the move of the pfn is done shortly before kexec.
1495 * This does not eliminate the race because all vcpus are still online
1496 * when the syscore_ops will be called. But hopefully there is no work
1497 * pending at this point in time. Also the syscore_op is run last which
1498 * reduces the risk further.
1499 */
1500
1501static struct shared_info *xen_hvm_shared_info;
1502
1503static void xen_hvm_connect_shared_info(unsigned long pfn)
1504{ 1473{
1474 int cpu;
1505 struct xen_add_to_physmap xatp; 1475 struct xen_add_to_physmap xatp;
1476 static struct shared_info *shared_info_page = 0;
1506 1477
1478 if (!shared_info_page)
1479 shared_info_page = (struct shared_info *)
1480 extend_brk(PAGE_SIZE, PAGE_SIZE);
1507 xatp.domid = DOMID_SELF; 1481 xatp.domid = DOMID_SELF;
1508 xatp.idx = 0; 1482 xatp.idx = 0;
1509 xatp.space = XENMAPSPACE_shared_info; 1483 xatp.space = XENMAPSPACE_shared_info;
1510 xatp.gpfn = pfn; 1484 xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1511 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 1485 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
1512 BUG(); 1486 BUG();
1513 1487
1514} 1488 HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1515static void xen_hvm_set_shared_info(struct shared_info *sip)
1516{
1517 int cpu;
1518
1519 HYPERVISOR_shared_info = sip;
1520 1489
1521 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 1490 /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
1522 * page, we use it in the event channel upcall and in some pvclock 1491 * page, we use it in the event channel upcall and in some pvclock
1523 * related functions. We don't need the vcpu_info placement 1492 * related functions. We don't need the vcpu_info placement
1524 * optimizations because we don't use any pv_mmu or pv_irq op on 1493 * optimizations because we don't use any pv_mmu or pv_irq op on
1525 * HVM. 1494 * HVM.
1526 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is 1495 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1527 * online but xen_hvm_set_shared_info is run at resume time too and 1496 * online but xen_hvm_init_shared_info is run at resume time too and
1528 * in that case multiple vcpus might be online. */ 1497 * in that case multiple vcpus might be online. */
1529 for_each_online_cpu(cpu) { 1498 for_each_online_cpu(cpu) {
1530 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1499 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1531 } 1500 }
1532} 1501}
1533 1502
1534/* Reconnect the shared_info pfn to a mfn */ 1503#ifdef CONFIG_XEN_PVHVM
1535void xen_hvm_resume_shared_info(void)
1536{
1537 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1538}
1539
1540#ifdef CONFIG_KEXEC
1541static struct shared_info *xen_hvm_shared_info_kexec;
1542static unsigned long xen_hvm_shared_info_pfn_kexec;
1543
1544/* Remember a pfn in MMIO space for kexec reboot */
1545void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1546{
1547 xen_hvm_shared_info_kexec = sip;
1548 xen_hvm_shared_info_pfn_kexec = pfn;
1549}
1550
1551static void xen_hvm_syscore_shutdown(void)
1552{
1553 struct xen_memory_reservation reservation = {
1554 .domid = DOMID_SELF,
1555 .nr_extents = 1,
1556 };
1557 unsigned long prev_pfn;
1558 int rc;
1559
1560 if (!xen_hvm_shared_info_kexec)
1561 return;
1562
1563 prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1564 set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1565
1566 /* Move pfn to MMIO, disconnects previous pfn from mfn */
1567 xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1568
1569 /* Update pointers, following hypercall is also a memory barrier */
1570 xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1571
1572 /* Allocate new mfn for previous pfn */
1573 do {
1574 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1575 if (rc == 0)
1576 msleep(123);
1577 } while (rc == 0);
1578
1579 /* Make sure the previous pfn is really connected to a (new) mfn */
1580 BUG_ON(rc != 1);
1581}
1582
1583static struct syscore_ops xen_hvm_syscore_ops = {
1584 .shutdown = xen_hvm_syscore_shutdown,
1585};
1586#endif
1587
1588/* Use a pfn in RAM, may move to MMIO before kexec. */
1589static void __init xen_hvm_init_shared_info(void)
1590{
1591 /* Remember pointer for resume */
1592 xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1593 xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1594 xen_hvm_set_shared_info(xen_hvm_shared_info);
1595}
1596
1597static void __init init_hvm_pv_info(void) 1504static void __init init_hvm_pv_info(void)
1598{ 1505{
1599 int major, minor; 1506 int major, minor;
@@ -1644,9 +1551,6 @@ static void __init xen_hvm_guest_init(void)
1644 init_hvm_pv_info(); 1551 init_hvm_pv_info();
1645 1552
1646 xen_hvm_init_shared_info(); 1553 xen_hvm_init_shared_info();
1647#ifdef CONFIG_KEXEC
1648 register_syscore_ops(&xen_hvm_syscore_ops);
1649#endif
1650 1554
1651 if (xen_feature(XENFEAT_hvm_callback_vector)) 1555 if (xen_feature(XENFEAT_hvm_callback_vector))
1652 xen_have_vector_callback = 1; 1556 xen_have_vector_callback = 1;
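Note on the hunks above: with the kexec-specific shared_info juggling removed, xen_hvm_init_shared_info() goes back to registering one brk-allocated page with the hypervisor via XENMEM_add_to_physmap and re-deriving every online CPU's xen_vcpu pointer from it, both at boot and on resume. A rough shape of that flow follows, with the hypercall and per-CPU plumbing faked; all names and sizes are stand-ins.

/* Shape of the simplified shared_info setup: one static page, re-pointed
 * per-CPU references on every call.
 */
#include <stdio.h>

#define NR_CPUS 4

struct vcpu_info { int evtchn_pending; };
struct shared_info { struct vcpu_info vcpu_info[NR_CPUS]; };

static struct shared_info shared_info_page;	/* stand-in for extend_brk() */
static struct shared_info *HYPERVISOR_shared_info;
static struct vcpu_info *xen_vcpu[NR_CPUS];
static int online_cpus = 1;			/* 1 at boot, more on resume */

static void hvm_init_shared_info(void)
{
	/* real code: XENMEM_add_to_physmap with the gpfn of the page */
	HYPERVISOR_shared_info = &shared_info_page;

	for (int cpu = 0; cpu < online_cpus; cpu++)
		xen_vcpu[cpu] = &HYPERVISOR_shared_info->vcpu_info[cpu];
}

int main(void)
{
	hvm_init_shared_info();		/* boot: only vcpu 0 online */
	online_cpus = 4;
	hvm_init_shared_info();		/* resume: all online CPUs repointed */
	printf("vcpu3 info at %p\n", (void *)xen_vcpu[3]);
	return 0;
}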
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b65a76133f4f..5141d808e751 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1283,7 +1283,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1284 1284
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1286 if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1286 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI; 1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = start; 1288 args->op.arg1.linear_addr = start;
1289 } 1289 }
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 64effdc6da94..76ba0e97e530 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -194,6 +194,13 @@ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID
194 * boundary violation will require three middle nodes. */ 194 * boundary violation will require three middle nodes. */
195RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); 195RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
196 196
197/* When we populate back during bootup, the amount of pages can vary. The
198 * max we have seen is 395979, but that does not mean it can't be more.
199 * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
200 * it can re-use Xen provided mfn_list array, so we only need to allocate at
201 * most three P2M top nodes. */
202RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
203
197static inline unsigned p2m_top_index(unsigned long pfn) 204static inline unsigned p2m_top_index(unsigned long pfn)
198{ 205{
199 BUG_ON(pfn >= MAX_P2M_PFN); 206 BUG_ON(pfn >= MAX_P2M_PFN);
@@ -570,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn)
570 } 577 }
571 return true; 578 return true;
572} 579}
580
581/*
582 * Skim over the P2M tree looking at pages that are either filled with
583 * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
584 * replace the P2M leaf with a p2m_missing or p2m_identity.
585 * Stick the old page in the new P2M tree location.
586 */
587bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
588{
589 unsigned topidx;
590 unsigned mididx;
591 unsigned ident_pfns;
592 unsigned inv_pfns;
593 unsigned long *p2m;
594 unsigned long *mid_mfn_p;
595 unsigned idx;
596 unsigned long pfn;
597
598 /* We only look when this entails a P2M middle layer */
599 if (p2m_index(set_pfn))
600 return false;
601
602 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
603 topidx = p2m_top_index(pfn);
604
605 if (!p2m_top[topidx])
606 continue;
607
608 if (p2m_top[topidx] == p2m_mid_missing)
609 continue;
610
611 mididx = p2m_mid_index(pfn);
612 p2m = p2m_top[topidx][mididx];
613 if (!p2m)
614 continue;
615
616 if ((p2m == p2m_missing) || (p2m == p2m_identity))
617 continue;
618
619 if ((unsigned long)p2m == INVALID_P2M_ENTRY)
620 continue;
621
622 ident_pfns = 0;
623 inv_pfns = 0;
624 for (idx = 0; idx < P2M_PER_PAGE; idx++) {
625 /* IDENTITY_PFNs are 1:1 */
626 if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
627 ident_pfns++;
628 else if (p2m[idx] == INVALID_P2M_ENTRY)
629 inv_pfns++;
630 else
631 break;
632 }
633 if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
634 goto found;
635 }
636 return false;
637found:
638 /* Found one, replace old with p2m_identity or p2m_missing */
639 p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
640 /* And the other for save/restore.. */
641 mid_mfn_p = p2m_top_mfn_p[topidx];
 642	/* NOTE: Even if it is a p2m_identity it should still point to
643 * a page filled with INVALID_P2M_ENTRY entries. */
644 mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
645
646 /* Reset where we want to stick the old page in. */
647 topidx = p2m_top_index(set_pfn);
648 mididx = p2m_mid_index(set_pfn);
649
650 /* This shouldn't happen */
651 if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
652 early_alloc_p2m(set_pfn);
653
654 if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
655 return false;
656
657 p2m_init(p2m);
658 p2m_top[topidx][mididx] = p2m;
659 mid_mfn_p = p2m_top_mfn_p[topidx];
660 mid_mfn_p[mididx] = virt_to_mfn(p2m);
661
662 return true;
663}
573bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) 664bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
574{ 665{
575 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 666 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
576 if (!early_alloc_p2m(pfn)) 667 if (!early_alloc_p2m(pfn))
577 return false; 668 return false;
578 669
670 if (early_can_reuse_p2m_middle(pfn, mfn))
671 return __set_phys_to_machine(pfn, mfn);
672
579 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) 673 if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
580 return false; 674 return false;
581 675
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ead85576d54a..d11ca11d14fc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
78 memblock_reserve(start, size); 78 memblock_reserve(start, size);
79 79
80 xen_max_p2m_pfn = PFN_DOWN(start + size); 80 xen_max_p2m_pfn = PFN_DOWN(start + size);
81 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
82 unsigned long mfn = pfn_to_mfn(pfn);
83
84 if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
85 continue;
86 WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
87 pfn, mfn);
81 88
82 for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
83 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 89 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
90 }
84} 91}
85 92
86static unsigned long __init xen_do_chunk(unsigned long start, 93static unsigned long __init xen_do_chunk(unsigned long start,
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index ae8a00c39de4..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
30{ 30{
31#ifdef CONFIG_XEN_PVHVM 31#ifdef CONFIG_XEN_PVHVM
32 int cpu; 32 int cpu;
33 xen_hvm_resume_shared_info(); 33 xen_hvm_init_shared_info();
34 xen_callback_vector(); 34 xen_callback_vector();
35 xen_unplug_emulated_devices(); 35 xen_unplug_emulated_devices();
36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 36 if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1e4329e04e0f..202d4c150154 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
41void xen_vcpu_restore(void); 41void xen_vcpu_restore(void);
42 42
43void xen_callback_vector(void); 43void xen_callback_vector(void);
44void xen_hvm_resume_shared_info(void); 44void xen_hvm_init_shared_info(void);
45void xen_unplug_emulated_devices(void); 45void xen_unplug_emulated_devices(void);
46 46
47void __init xen_build_dynamic_phys_to_machine(void); 47void __init xen_build_dynamic_phys_to_machine(void);