From db023ea595015058270be6a62fe60a7b6b5c50d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Sep 2012 19:05:46 +0200 Subject: uprobes: Move clear_thread_flag(TIF_UPROBE) to uprobe_notify_resume() Move clear_thread_flag(TIF_UPROBE) from do_notify_resume() to uprobe_notify_resume() for !CONFIG_UPROBES case. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/signal.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908a376e..0041e5a5293b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -785,10 +785,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ - if (thread_info_flags & _TIF_UPROBE) { - clear_thread_flag(TIF_UPROBE); + if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); - } /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) -- cgit v1.2.2 From fcd8af585f587741c051f7124b8dee6c73c8629b Mon Sep 17 00:00:00 2001 From: David Hooper Date: Tue, 2 Oct 2012 15:26:53 +0100 Subject: x86/reboot: Remove quirk entry for SBC FITPC Remove the quirk for the SBC FITPC. It seems ot have been required when the default was kbd reboot, but no longer required now that the default is acpi reboot. Furthermore, BIOS reboot no longer works for this board as of 2.6.39 or any of the 3.x kernels. Signed-off-by: David Hooper Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20121002142635.17403.59959.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 52190a938b4a..4e8ba39eaf0f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -358,14 +358,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, - { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */ - .callback = set_bios_reboot, - .ident = "CompuLab SBC-FITPC2", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"), - DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), - }, - }, { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, .ident = "ASUS P4S800", -- cgit v1.2.2 From 961c79761dda351b5fb263a0654b98daac130b7a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 2 Oct 2012 11:34:09 +0300 Subject: x86/cache_info: Use ARRAY_SIZE() in amd_l3_attrs() Using ARRAY_SIZE() is more readable. Signed-off-by: Dan Carpenter Reviewed-by: Srivatsa S. Bhat Cc: Shai Fultheim Cc: Greg Kroah-Hartman Cc: Andreas Herrmann Link: http://lkml.kernel.org/r/20121002083409.GM12398@elgon.mountain Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 9a7c90d80bc4..93c5451bdd52 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -991,7 +991,7 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) if (attrs) return attrs; - n = sizeof (default_attrs) / sizeof (struct attribute *); + n = ARRAY_SIZE(default_attrs); if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) n += 2; -- cgit v1.2.2 From b64b9c937a533f0bfbfc9f6ac93d3c3e2f97ab02 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Sep 2012 21:31:08 +0200 Subject: uprobes/x86: Only rep+nop can be emulated correctly __skip_sstep() correctly detects the "nontrivial" nop insns, but since it doesn't update regs->ip we can not really skip "0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0", the probed application is killed by SIGILL'ed handle_swbp(). Remove these additional checks. If we want to implement this correctly we need to know the full insn length to update ->ip. rep* + nop is fine even without updating ->ip. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 9538f00827a9..aafa5557b396 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -651,31 +651,19 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) /* * Skip these instructions as per the currently known x86 ISA. - * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } + * rep=0x66*; nop=0x90 */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { int i; for (i = 0; i < MAX_UINSN_BYTES; i++) { - if ((auprobe->insn[i] == 0x66)) + if (auprobe->insn[i] == 0x66) continue; if (auprobe->insn[i] == 0x90) return true; - if (i == (MAX_UINSN_BYTES - 1)) - break; - - if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f)) - return true; - - if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19)) - return true; - - if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0)) - return true; - break; } return false; -- cgit v1.2.2 From 20b279ddb38ca42f8863cec07b4d45ec24589f13 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Sep 2012 14:59:14 -0600 Subject: perf: Require exclude_guest to use PEBS - kernel side enforcement Intel PEBS in VT-x context uses the DS address as a guest linear address, even though its programmed by the host as a host linear address. This either results in guest memory corruption and or the hardware faulting and 'crashing' the virtual machine. Therefore we have to disable PEBS on VT-x enter and re-enable on VT-x exit, enforcing a strict exclude_guest. This patch enforces exclude_guest kernel side. Signed-off-by: Peter Zijlstra Cc: Avi Kivity Cc: David Ahern Cc: Gleb Natapov Cc: Ingo Molnar Cc: Robert Richter Link: http://lkml.kernel.org/r/1347569955-54626-3-git-send-email-dsahern@gmail.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 915b876edd1e..3373f84d1397 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -338,6 +338,9 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; + + if (!attr->exclude_guest) + return -EOPNOTSUPP; } hwc->config |= config; @@ -380,6 +383,9 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { int precise = 0; + if (!event->attr.exclude_guest) + return -EOPNOTSUPP; + /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; -- cgit v1.2.2 From 1bbbbe779aabe1f0768c2bf8f8c0a5583679b54a Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Thu, 20 Oct 2011 16:15:26 -0500 Subject: x86: Exclude E820_RESERVED regions and memory holes above 4 GB from direct mapping. On systems with very large memory (1 TB in our case), BIOS may report a reserved region or a hole in the E820 map, even above the 4 GB range. Exclude these from the direct mapping. [ hpa: this should be done not just for > 4 GB but for everything above the legacy region (1 MB), at the very least. That, however, turns out to require significant restructuring. That work is well underway, but is not suitable for rc/stable. ] Cc: stable@kernel.org # > 2.6.32 Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/1319145326-13902-1-git-send-email-jacob.shin@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80e1b95..198e774498ec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -919,8 +919,21 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<addr + ei->size <= 1UL << 32) + continue; + + if (ei->type == E820_RESERVED) + continue; + + max_pfn_mapped = init_memory_mapping( + ei->addr < 1UL << 32 ? 1UL << 32 : ei->addr, + ei->addr + ei->size); + } + /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } -- cgit v1.2.2 From 21c5e50e15b1abd797e62f18fd7f90b9cc004cbd Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 1 Oct 2012 14:42:05 +0800 Subject: x86, amd, mce: Avoid NULL pointer reference on CPU northbridge lookup When booting on a federated multi-server system (NumaScale), the processor Northbridge lookup returns NULL; add guards to prevent this causing an oops. On those systems, the northbridge is accessed through MMIO and the "normal" northbridge enumeration in amd_nb.c doesn't work since we're generating the northbridge ID from the initial APIC ID and the last is not unique on those systems. Long story short, we end up without northbridge descriptors. Signed-off-by: Daniel J Blueman Cc: stable@vger.kernel.org # 3.6 Link: http://lkml.kernel.org/r/1349073725-14093-1-git-send-email-daniel@numascale-asia.com [ Boris: beef up commit message ] Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index c4e916d77378..698b6ec12e0f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -576,12 +576,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int err = 0; if (shared_bank[bank]) { - nb = node_to_amd_nb(amd_get_nb_id(cpu)); - WARN_ON(!nb); /* threshold descriptor already initialized on this node? */ - if (nb->bank4) { + if (nb && nb->bank4) { /* yes, use it */ b = nb->bank4; err = kobject_add(b->kobj, &dev->kobj, name); @@ -615,8 +613,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) atomic_set(&b->cpus, 1); /* nb is already initialized, see above */ - WARN_ON(nb->bank4); - nb->bank4 = b; + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } } err = allocate_threshold_blocks(cpu, bank, 0, -- cgit v1.2.2 From 5bc66170dc486556a1e36fd384463536573f4b82 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Oct 2012 15:10:56 +0200 Subject: x86, MCE: Remove bios_cmci_threshold sysfs attribute 450cc201038f3 ("x86/mce: Provide boot argument to honour bios-set CMCI threshold") added the bios_cmci_threshold sysfs attribute which was supposed to communicate to userspace tools that BIOS CMCI threshold has been honoured. However, this info is not of any importance to userspace - it should rather get the actual error count it has been thresholded already from MCi_STATUS[38:52]. So drop this before it becomes a used interface (good thing we caught this early in 3.7-rc1, right after the merge window closed). Cc: Naveen N. Rao Acked-by: Tony Luck Link: http://lkml.kernel.org/r/20121017105940.GA14590@x1.osrc.amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 29e87d3b2843..46cbf8689692 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2209,11 +2209,6 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { &mce_cmci_disabled }; -static struct dev_ext_attribute dev_attr_bios_cmci_threshold = { - __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL), - &mce_bios_cmci_threshold -}; - static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, @@ -2222,7 +2217,6 @@ static struct device_attribute *mce_device_attrs[] = { &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, - &dev_attr_bios_cmci_threshold.attr, NULL }; -- cgit v1.2.2 From a349e23d1cf746f8bdc603dcc61fae9ee4a695f6 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 19 Oct 2012 17:29:07 +0100 Subject: xen/x86: don't corrupt %eip when returning from a signal handler In 32 bit guests, if a userspace process has %eax == -ERESTARTSYS (-512) or -ERESTARTNOINTR (-513) when it is interrupted by an event /and/ the process has a pending signal then %eip (and %eax) are corrupted when returning to the main process after handling the signal. The application may then crash with SIGSEGV or a SIGILL or it may have subtly incorrect behaviour (depending on what instruction it returned to). The occurs because handle_signal() is incorrectly thinking that there is a system call that needs to restarted so it adjusts %eip and %eax to re-execute the system call instruction (even though user space had not done a system call). If %eax == -514 (-ERESTARTNOHAND (-514) or -ERESTART_RESTARTBLOCK (-516) then handle_signal() only corrupted %eax (by setting it to -EINTR). This may cause the application to crash or have incorrect behaviour. handle_signal() assumes that regs->orig_ax >= 0 means a system call so any kernel entry point that is not for a system call must push a negative value for orig_ax. For example, for physical interrupts on bare metal the inverse of the vector is pushed and page_fault() sets regs->orig_ax to -1, overwriting the hardware provided error code. xen_hypervisor_callback() was incorrectly pushing 0 for orig_ax instead of -1. Classic Xen kernels pushed %eax which works as %eax cannot be both non-negative and -RESTARTSYS (etc.), but using -1 is consistent with other non-system call entry points and avoids some of the tests in handle_signal(). There were similar bugs in xen_failsafe_callback() of both 32 and 64-bit guests. If the fault was corrected and the normal return path was used then 0 was incorrectly pushed as the value for orig_ax. Signed-off-by: David Vrabel Acked-by: Jan Beulich Acked-by: Ian Campbell Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/entry_32.S | 8 +++++--- arch/x86/kernel/entry_64.S | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 623f28837476..8f8e8eea9085 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1016,7 +1016,7 @@ ENTRY(xen_sysenter_target) ENTRY(xen_hypervisor_callback) CFI_STARTPROC - pushl_cfi $0 + pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL TRACE_IRQS_OFF @@ -1058,14 +1058,16 @@ ENTRY(xen_failsafe_callback) 2: mov 8(%esp),%es 3: mov 12(%esp),%fs 4: mov 16(%esp),%gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ testl %eax,%eax popl_cfi %eax lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp - jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) -5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) + jmp iret_exc +5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL jmp ret_from_exception CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8c834f..dcdd0ea33a32 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1363,7 +1363,7 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 + pushq_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL jmp error_exit CFI_ENDPROC -- cgit v1.2.2 From a05123bdd1b9ba961ed262864924a5b3ee81afe8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 21 Aug 2012 17:08:37 +0800 Subject: perf/x86: Disable uncore on virtualized CPUs Initializing uncore PMU on virtualized CPU may hang the kernel. This is because kvm does not emulate the entire hardware. Thers are lots of uncore related MSRs, making kvm enumerate them all is a non-trival task. So just disable uncore on virtualized CPU. Signed-off-by: Yan, Zheng Tested-by: Pekka Enberg Cc: a.p.zijlstra@chello.nl Cc: eranian@google.com Cc: andi@firstfloor.org Cc: avi@redhat.com Link: http://lkml.kernel.org/r/1345540117-14164-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 99d96a4978b5..5df8d32ba91e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2926,6 +2926,9 @@ static int __init intel_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; + if (cpu_has_hypervisor) + return -ENODEV; + ret = uncore_pci_init(); if (ret) goto fail; -- cgit v1.2.2 From c5e015d4949aa665c486cae6884beb00b97e3dea Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 19 Oct 2012 12:11:55 -0400 Subject: KVM guest: exit idleness when handling KVM_PV_REASON_PAGE_NOT_PRESENT KVM_PV_REASON_PAGE_NOT_PRESENT kicks cpu out of idleness, but we haven't marked that spot as an exit from idleness. Not doing so can cause RCU warnings such as: [ 732.788386] =============================== [ 732.789803] [ INFO: suspicious RCU usage. ] [ 732.790032] 3.7.0-rc1-next-20121019-sasha-00002-g6d8d02d-dirty #63 Tainted: G W [ 732.790032] ------------------------------- [ 732.790032] include/linux/rcupdate.h:738 rcu_read_lock() used illegally while idle! [ 732.790032] [ 732.790032] other info that might help us debug this: [ 732.790032] [ 732.790032] [ 732.790032] RCU used illegally from idle CPU! [ 732.790032] rcu_scheduler_active = 1, debug_locks = 1 [ 732.790032] RCU used illegally from extended quiescent state! [ 732.790032] 2 locks held by trinity-child31/8252: [ 732.790032] #0: (&rq->lock){-.-.-.}, at: [] __schedule+0x178/0x8f0 [ 732.790032] #1: (rcu_read_lock){.+.+..}, at: [] cpuacct_charge+0xe/0x200 [ 732.790032] [ 732.790032] stack backtrace: [ 732.790032] Pid: 8252, comm: trinity-child31 Tainted: G W 3.7.0-rc1-next-20121019-sasha-00002-g6d8d02d-dirty #63 [ 732.790032] Call Trace: [ 732.790032] [] lockdep_rcu_suspicious+0x10b/0x120 [ 732.790032] [] cpuacct_charge+0x90/0x200 [ 732.790032] [] ? cpuacct_charge+0xe/0x200 [ 732.790032] [] update_curr+0x1a3/0x270 [ 732.790032] [] dequeue_entity+0x2a/0x210 [ 732.790032] [] dequeue_task_fair+0x45/0x130 [ 732.790032] [] dequeue_task+0x89/0xa0 [ 732.790032] [] deactivate_task+0x1e/0x20 [ 732.790032] [] __schedule+0x879/0x8f0 [ 732.790032] [] ? trace_hardirqs_off+0xd/0x10 [ 732.790032] [] ? kvm_async_pf_task_wait+0x1d5/0x2b0 [ 732.790032] [] schedule+0x55/0x60 [ 732.790032] [] kvm_async_pf_task_wait+0x1f4/0x2b0 [ 732.790032] [] ? abort_exclusive_wait+0xb0/0xb0 [ 732.790032] [] ? prepare_to_wait+0x25/0x90 [ 732.790032] [] do_async_page_fault+0x56/0xa0 [ 732.790032] [] async_page_fault+0x28/0x30 Signed-off-by: Sasha Levin Acked-by: Gleb Natapov Acked-by: Paul E. McKenney Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b3e5e51bc907..4180a874c764 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -247,7 +247,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ + rcu_irq_enter(); + exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); + rcu_irq_exit(); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); -- cgit v1.2.2 From bffd5fc26043cce33158d4e027576e79fab2f7bb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 9 Oct 2012 17:38:35 +0200 Subject: x86/perf: Fix virtualization sanity check In check_hw_exists() we try to detect non-emulated MSR accesses by writing an arbitrary value into one of the PMU registers and check if it's value after a readout is still the same. This algorithm silently assumes that the register does not contain the magic value already, which is wrong in at least one situation. Fix the algorithm to really do a read-modify-write cycle. This fixes a warning under Xen under some circumstances on AMD family 10h CPUs. The reasons in more details actually sound like a story from Believe It or Not!: First you need an AMD family 10h/12h CPU. These do not reset the PERF_CTR registers on a reboot. Now you boot bare metal Linux, which goes successfully through this check, but leaves the magic value of 0xabcd in the register. You don't use the performance counters, but do a reboot (warm reset). Then you choose to boot Xen. The check will be triggered with a recent Linux kernel as Dom0 again, trying to write 0xabcd into the MSR. Xen silently drops the write (expected), but the subsequent read will return the value in the register, which just happens to be the expected magic value. Thus the test misleadingly succeeds, leaving the kernel in the belief that the PMU is available. This will trigger the following message: [ 0.020294] ------------[ cut here ]------------ [ 0.020311] WARNING: at arch/x86/xen/enlighten.c:730 xen_apic_write+0x15/0x17() [ 0.020318] Hardware name: empty [ 0.020323] Modules linked in: [ 0.020334] Pid: 1, comm: swapper/0 Not tainted 3.3.8 #7 [ 0.020340] Call Trace: [ 0.020354] [] warn_slowpath_common+0x80/0x98 [ 0.020369] [] warn_slowpath_null+0x15/0x17 [ 0.020378] [] xen_apic_write+0x15/0x17 [ 0.020392] [] perf_events_lapic_init+0x2e/0x30 [ 0.020410] [] init_hw_perf_events+0x250/0x407 [ 0.020419] [] ? check_bugs+0x2d/0x2d [ 0.020430] [] do_one_initcall+0x7a/0x131 [ 0.020444] [] kernel_init+0x91/0x15d [ 0.020456] [] kernel_thread_helper+0x4/0x10 [ 0.020471] [] ? retint_restore_args+0x5/0x6 [ 0.020481] [] ? gs_change+0x13/0x13 [ 0.020500] ---[ end trace a7919e7f17c0a725 ]--- The new code will change every of the 16 low bits read from the register and tries to write and read-back that modified number from the MSR. Signed-off-by: Andre Przywara Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Avi Kivity Link: http://lkml.kernel.org/r/1349797115-28346-2-git-send-email-andre.przywara@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3373f84d1397..4a3374e61a93 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -208,12 +208,14 @@ static bool check_hw_exists(void) } /* - * Now write a value and read it back to see if it matches, - * this is needed to detect certain hardware emulators (qemu/kvm) - * that don't trap on the MSR access and always return 0s. + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - val = 0xabcdUL; reg = x86_pmu_event_addr(0); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) -- cgit v1.2.2 From 7991c9ca40d3127dd2ffa3a9c1e33f7d4005495a Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:30:01 -0400 Subject: perf/x86: Fix P6 FP_ASSIST event constraint According to Intel SDM Volume 3B, FP_ASSIST is limited to Counter 1 only, not Counter 0. Tested on a Pentium II. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191728570.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index e4dd0f7a0453..0ff5f7fb64cd 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -34,7 +34,7 @@ static struct event_constraint p6_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ -- cgit v1.2.2 From e09df47885d767e418902067ce1885aafa3b27db Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:31:54 -0400 Subject: perf/x86: Update/fix generic events on P6 PMU This patch updates the generic events on p6, including some new extended cache events. Values for these events were taken from the equivelant PAPI predefined events. Tested on a Pentium II. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191730080.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 111 +++++++++++++++++++++++++++++++++--- 1 file changed, 104 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 0ff5f7fb64cd..9582fcbcd8ec 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -8,13 +8,106 @@ */ static const u64 p6_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static __initconst u64 p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p6_pmu_event_map(int hw_event) @@ -158,5 +251,9 @@ __init int p6_pmu_init(void) x86_pmu = p6_pmu; + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; } -- cgit v1.2.2 From 58e9eaf06f5476cb2192ec1d012674ce5e79dd21 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:33:38 -0400 Subject: perf/x86: Remove P6 cpuc->enabled check Between 2.6.33 and 2.6.34 the PMU code was made modular. The x86_pmu_enable() call was extended to disable cpuc->enabled and iterate the counters, enabling one at a time, before calling enable_all() at the end, followed by re-enabling cpuc->enabled. Since cpuc->enabled was set to 0, that change effectively caused the "val |= ARCH_PERFMON_EVENTSEL_ENABLE;" code in p6_pmu_enable_event() and p6_pmu_disable_event() to be dead code that was never called. This change removes this code (which was confusing) and adds some extra commentary to make it more clear what is going on. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191732000.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 9582fcbcd8ec..7d0270bd793e 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -157,25 +157,25 @@ static void p6_pmu_enable_all(int added) static inline void p6_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. + */ (void)wrmsrl_safe(hwc->config_base, val); } -- cgit v1.2.2 From 032c3851f51141e30de02ed0bc50a7743dfd776d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 24 Oct 2012 16:42:20 +0800 Subject: perf/x86/uncore: Handle pci_read_config_dword() errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This, beyond handling corner cases, also fixes some build warnings: arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_disable_box’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:124:9: warning: ‘config’ is used uninitialized in this function [-Wuninitialized] arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_enable_box’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:135:9: warning: ‘config’ is used uninitialized in this function [-Wuninitialized] arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_read_counter’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:164:2: warning: ‘count’ is used uninitialized in this function [-Wuninitialized] Signed-off-by: Yan, Zheng Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/1351068140-13456-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 43 +++++++++++++++++---------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 5df8d32ba91e..6f874771ee65 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -118,22 +118,24 @@ static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -156,7 +158,7 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; - u64 count; + u64 count = 0; pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); @@ -603,11 +605,12 @@ static struct pci_driver snbep_uncore_pci_driver = { /* * build pci bus to socket mapping */ -static void snbep_pci2phy_map_init(void) +static int snbep_pci2phy_map_init(void) { struct pci_dev *ubox_dev = NULL; int i, bus, nodeid; - u32 config; + int err = 0; + u32 config = 0; while (1) { /* find the UBOX device */ @@ -618,10 +621,14 @@ static void snbep_pci2phy_map_init(void) break; bus = ubox_dev->bus->number; /* get the Node ID of the local register */ - pci_read_config_dword(ubox_dev, 0x40, &config); + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; nodeid = config; /* get the Node ID mapping */ - pci_read_config_dword(ubox_dev, 0x54, &config); + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; /* * every three bits in the Node ID mapping register maps * to a particular node. @@ -633,7 +640,11 @@ static void snbep_pci2phy_map_init(void) } } }; - return; + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; } /* end of Sandy Bridge-EP uncore support */ @@ -2578,9 +2589,11 @@ static int __init uncore_pci_init(void) switch (boot_cpu_data.x86_model) { case 45: /* Sandy Bridge-EP */ + ret = snbep_pci2phy_map_init(); + if (ret) + return ret; pci_uncores = snbep_pci_uncores; uncore_pci_driver = &snbep_uncore_pci_driver; - snbep_pci2phy_map_init(); break; default: return 0; -- cgit v1.2.2 From ae5ba47a990a18c869d66916fd72fb334c45cf91 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:03:21 -0400 Subject: perf/x86: Make Intel KNC use full 40-bit width of counters Early versions of Intel KNC chips have a bug where bits above 32 were not properly set. We worked around this by only using the bottom 32 bits (out of 40 that should be available). It turns out this workaround breaks overflow handling. The buggy silicon will in theory never be used in production systems, so remove this workaround so we get proper overflow support. Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171302140.23243@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 7c46bfdbc373..73bcfbdedd50 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -226,12 +226,11 @@ static __initconst struct x86_pmu knc_pmu = { .event_map = knc_pmu_event_map, .max_events = ARRAY_SIZE(knc_perfmon_event_map), .apic = 1, - .max_period = (1ULL << 31) - 1, + .max_period = (1ULL << 39) - 1, .version = 0, .num_counters = 2, - /* in theory 40 bits, early silicon is buggy though */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = knc_event_constraints, .format_attrs = intel_knc_formats_attr, -- cgit v1.2.2 From 7d011962afbaa6e572cd8e0dbb7abf773e166e64 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:04:33 -0400 Subject: perf/x86: Remove cpuc->enable check on Intl KNC event enable/disable x86_pmu.enable() is called from x86_pmu_enable() with cpuc->enabled set to 0. This means we weren't re-enabling the counters after a context switch. This patch just removes the check, as it should't be necessary (and the equivelent x86_ generic code does not have the checks). The origin of this problem is the KNC driver being based on the P6 one. The P6 driver also has this issue, but works anyway due to various lucky accidents. Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Cc: Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171303290.23243@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 73bcfbdedd50..f3a2af41552d 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -173,26 +173,22 @@ static void knc_pmu_enable_all(int added) static inline void knc_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } static void knc_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } -- cgit v1.2.2 From e4074b3049f99c6ad6e1a33e6d93d8ec0652e2c1 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:05:45 -0400 Subject: perf/x86: Enable overflow on Intel KNC with a custom knc_pmu_handle_irq() Although based on the Intel P6 design, the interrupt mechnanism for KNC more closely resembles the Intel architectural perfmon one. We can't just re-use that code though, because KNC has different MSR numbers for the status and ack registers. In this case we just cut-and paste from perf_event_intel.c with some minor changes, as it looks like it would not be worth the trouble to change that code to be MSR-configurable. Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171304410.23243@vincent-weaver-1.um.maine.edu [ Small stylistic edits. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 78 +++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index f3a2af41552d..4b7731bf23a8 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -3,6 +3,8 @@ #include #include +#include + #include "perf_event.h" static const u64 knc_perfmon_event_map[] = @@ -193,6 +195,80 @@ static void knc_pmu_enable_event(struct perf_event *event) (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = &__get_cpu_var(cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -210,7 +286,7 @@ static struct attribute *intel_knc_formats_attr[] = { static __initconst struct x86_pmu knc_pmu = { .name = "knc", - .handle_irq = x86_pmu_handle_irq, + .handle_irq = knc_pmu_handle_irq, .disable_all = knc_pmu_disable_all, .enable_all = knc_pmu_enable_all, .enable = knc_pmu_enable_event, -- cgit v1.2.2 From 64dfab8e83644902ad2fd559a56c411b47e3ef3c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 22 Oct 2012 16:51:38 +0800 Subject: perf/x86: Remove unused variable in nhmex_rbox_alter_er() The variable port is initialized but never used otherwise, so remove the unused variable. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Cc: Yan, Zheng Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/CAPgLHd8NZkYSkZm22FpZxiEh6HcA0q-V%3D29vdnheiDhgrJZ%2Byw@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 6f874771ee65..3cf3d97cce3a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1558,7 +1558,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int port; /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { @@ -1570,7 +1569,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) } /* adjust extra register config */ - port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { case 2: /* shift the 8~15 bits to the 0~7 bits */ -- cgit v1.2.2 From 94777fc51b3ad85ff9f705ddf7cdd0eb3bbad5a6 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 16 Oct 2012 07:50:21 -0500 Subject: x86/irq/ioapic: Check for valid irq_cfg pointer in smp_irq_move_cleanup_interrupt Posting this patch to fix an issue concerning sparse irq's that I raised a while back. There was discussion about adding refcounting to sparse irqs (to fix other potential race conditions), but that does not appear to have been addressed yet. This covers the only issue of this type that I've encountered in this area. A NULL pointer dereference can occur in smp_irq_move_cleanup_interrupt() if we haven't yet setup the irq_cfg pointer in the irq_desc.irq_data.chip_data. In create_irq_nr() there is a window where we have set vector_irq in __assign_irq_vector(), but not yet called irq_set_chip_data() to set the irq_cfg pointer. Should an IRQ_MOVE_CLEANUP_VECTOR hit the cpu in question during this time, smp_irq_move_cleanup_interrupt() will attempt to process the aforementioned irq, but panic when accessing irq_cfg. Only continue processing the irq if irq_cfg is non-NULL. Signed-off-by: Dimitri Sivanich Cc: Suresh Siddha Cc: Joerg Roedel Cc: Yinghai Lu Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/20121016125021.GA22935@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c265593ec2cd..1817fa911024 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2257,6 +2257,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); + if (!cfg) + continue; + raw_spin_lock(&desc->lock); /* -- cgit v1.2.2 From 6ede1fd3cb404c0016de6ac529df46d561bd558b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Trim memory in memblock to be page aligned We will not map partial pages, so need to make sure memblock allocation will not allocate those bytes out. Also we will use for_each_mem_pfn_range() to loop to map memory range to keep them consistent. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/e820.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ed858e9e9a74..df06ade26bef 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1077,6 +1077,9 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } + /* throw away partial pages */ + memblock_trim_memory(PAGE_SIZE); + memblock_dump_all(); } -- cgit v1.2.2 From 1f2ff682ac951ed82cc043cf140d2851084512df Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Use memblock memory loop instead of e820_RAM We need to handle E820_RAM and E820_RESERVED_KERNEL at the same time. Also memblock has page aligned range for ram, so we could avoid mapping partial pages. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/setup.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 468e98dfd44e..5d888af8682a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -921,18 +921,19 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + unsigned long start_pfn, end_pfn; - if (ei->addr + ei->size <= 1UL << 32) - continue; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, + NULL) { - if (ei->type == E820_RESERVED) + end = PFN_PHYS(end_pfn); + if (end <= (1UL<<32)) continue; + start = PFN_PHYS(start_pfn); max_pfn_mapped = init_memory_mapping( - ei->addr < 1UL << 32 ? 1UL << 32 : ei->addr, - ei->addr + ei->size); + max((1UL<<32), start), end); } /* can we preseve max_low_pfn ?*/ -- cgit v1.2.2 From 5189c2a7c7769ee9d037d76c1a7b8550ccf3481c Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Wed, 24 Oct 2012 10:00:44 -0700 Subject: x86: efi: Turn off efi_enabled after setup on mixed fw/kernel When 32-bit EFI is used with 64-bit kernel (or vice versa), turn off efi_enabled once setup is done. Beyond setup, it is normally used to determine if runtime services are available and we will have none. This will resolve issues stemming from efivars modprobe panicking on a 32/64-bit setup, as well as some reboot issues on similar setups. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=45991 Reported-by: Marko Kohtala Reported-by: Maxim Kammerer Signed-off-by: Olof Johansson Acked-by: Maarten Lankhorst Cc: stable@kernel.org # 3.4 - 3.6 Cc: Matthew Garrett Signed-off-by: Matt Fleming --- arch/x86/kernel/setup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a2bb18e02839..6fd7752cee96 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1035,6 +1035,18 @@ void __init setup_arch(char **cmdline_p) arch_init_ideal_nops(); register_refined_jiffies(CLOCK_TICK_RATE); + +#ifdef CONFIG_EFI + /* Once setup is done above, disable efi_enabled on mismatched + * firmware/kernel archtectures since there is no support for + * runtime services. + */ + if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + efi_enabled = 0; + } +#endif } #ifdef CONFIG_X86_32 -- cgit v1.2.2